External data

# Table of contents -------------------------------------------------------
# 1. vdemdata CHECK
# 2. Rainbowmap CHECK
# 3. Economist Democracy Scores for 2018 CHECK
# 4. Happiness CHECK
# 5. GDP per capita CHECK
# 6. ESS round 9 CHECK
# 7. Unemployment rate CHECK
# 8. Gender equality index CHECK  


## further ideas:
# which political parties governed in the years before (i.e., left, centre-right etc.)
# Gini index
# migrant acceptance scores
# welfare state/ size of government relative to GDP
# physical/ mental health issues; quality and equity of healthcare
# solidarity with minority groups
# education


# load libraries ----------------------------------------------------------
# devtools::install_github("vdeminstitute/vdemdata")

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(readxl)
library(vdemdata)
library(survey) 
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(countrycode)
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(stringr)
library(mice)       
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(visdat)    
library(naniar)     
library(caret)      
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# mapping -----------------------------------------------------------------
# we comment out the country codes as we likely don't need them
survey_country_mapping <- data.frame(
  country_name = c("France", "Belgium", "Netherlands", "Germany", "Italy", 
                   "Luxembourg", "Denmark", "Ireland", "United Kingdom", "Greece", 
                   "Spain", "Portugal", "Finland", "Sweden", 
                   "Austria", "Cyprus", "Czech Republic", "Estonia", "Hungary", 
                   "Latvia", "Lithuania", "Malta", "Poland", "Slovakia", 
                   "Slovenia", "Bulgaria", "Romania", "Croatia"),
  #survey_country_code = c(1, 2, 3, 4, 5, 
  #                        6, 7, 8, 9, 11, 
  #                        12, 13, 16, 17, 
  #                        18, 19, 20, 21, 22, 
  #                        23, 24, 25, 26, 27, 
  #                        28, 29, 30, 32),
  iso2 = c("FR", "BE", "NL", "DE", "IT", 
           "LU", "DK", "IE", "GB", "GR", 
           "ES", "PT", "FI", "SE", 
           "AT", "CY", "CZ", "EE", "HU", 
           "LV", "LT", "MT", "PL", "SK", 
           "SI", "BG", "RO", "HR"),
  stringsAsFactors = FALSE)

# create a mapping for country name standardization: the Czech Republic is a problem
country_name_mapping <- c("Czechia" = "Czech Republic")


# 1. vdemdata ----------------------------------------------------------------
# install the package from GitHub first:
# devtools::install_github("vdeminstitute/vdemdata")
vdem_data <- vdem

# apply country name standardization before filtering
vdem_data_standardized <- vdem_data %>%
  mutate(
    # Standardize country names
    country_name = ifelse(country_name %in% names(country_name_mapping),
                          country_name_mapping[country_name],
                          country_name))

# filter for most recent data (2019 to match survey data)
vdem_2019 <- vdem_data_standardized %>%
  filter(country_name %in% survey_country_mapping$country_name, year == 2019) %>%
  select(country_name, v2x_libdem, v2x_polyarchy, v2x_gender, 
         v2x_egaldem, v2x_liberal, v2xcs_ccsi, v2x_freexp)  # select relevant variables

saveRDS(vdem_2019, file = "vdem_eu_2019.rds")
write.csv(vdem_2019, "vdem_eu_2019.csv")

## from the codebook: 
# v2x_libdem: index of liberal democracy
# v2x_polyarchy: index of electoral democracy
# v2x_gender: index of women's political empowerment
# v2x_egaldem: index of egalitarian democracy   
# v2x_liberal: index of civil liberties
# v2xcs_ccsi:  index of civil society participation
# v2x_freexp: index of freedom of expression



# 2. Rainbowmap --------------------------------------------------------------
# https://rainbowmap.ilga-europe.org/

# rainbow_data <- read_csv("data/raw/2024-rainbow-map-data.csv")

# but the problem: that's for 2024, not 2019 or before 2019
# hence, we would need to get the data for 2019 and years before:

# create data frames for each year
df_2019 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Norway", "France", "Finland",
              "Denmark", "Spain", "Portugal", "Sweden", "Netherlands", "Austria",
              "Germany", "Croatia", "Greece", "Ireland", "Hungary", "Luxembourg",
              "Iceland", "Slovenia", "Montenegro", "Estonia", "Switzerland", "Georgia",
              "Bosnia & Herzegovina", "Slovakia", "Albania", "Serbia", "Bulgaria", 
              "Czechia", "Kosovo", "Andorra", "Cyprus", "Romania", "Ukraine", 
              "Lithuania", "Italy", "Poland", "Latvia", "Belarus", "Moldova", 
              "Russia", "North Macedonia", "Liechtenstein", "San Marino", "Armenia",
              "Turkey", "Monaco", "Azerbaijan"),
  Value = c(74.72, 70.40, 67.62, 63.64, 62.73, 62.20, 60.31, 58.49, 57.50, 52.86,
            49.72, 48.54, 47.45, 45.83, 44.82, 44.72, 42.83, 41.34, 40.20, 39.82, 
            37.16, 34.38, 30.06, 29.49, 29.37, 29.04, 27.74, 26.43, 25.75, 25.50,
            25.34, 23.93, 22.03, 20.67, 19.97, 19.91, 19.43, 18.10, 16.83, 15.82,
            15.10, 11.75, 10.88, 10.08, 9.87, 8.50, 8.50, 7.31, 5.67),
  Year = 2019)

df_2018 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Finland", "France", "Norway",
              "Portugal", "Denmark", "Spain", "Sweden", "Netherlands", "Germany",
              "Austria", "Greece", "Ireland", "Croatia", "Slovenia", "Luxembourg",
              "Iceland", "Hungary", "Estonia", "Switzerland", "Montenegro", "Andorra",
              "Albania", "Kosovo", "Bosnia & Herzegovina", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Lithuania",
              "Romania", "Ukraine", "Poland", "Liechtenstein", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Russia", 
              "Monaco", "Turkey", "Armenia", "Azerbaijan"),
  Value = c(91.94, 78.70, 73.48, 73.27, 72.81, 72.74, 69.16, 67.69, 67.03, 60.10,
            59.64, 59.00, 56.40, 52.32, 52.22, 50.58, 47.73, 47.48, 47.22, 47.16,
            39.34, 38.44, 37.74, 34.81, 33.24, 32.98, 31.38, 29.68, 29.20, 28.95,
            28.65, 28.82, 25.87, 24.15, 21.78, 21.12, 20.95, 18.23, 17.87, 16.07,
            14.03, 13.35, 13.08, 12.32, 10.80, 9.78, 8.60, 7.20, 4.70),
  Year = 2018)

df_2017 <- data.frame(
  Country = c("Malta", "Norway", "United Kingdom", "Belgium", "France", "Portugal",
              "Finland", "Denmark", "Spain", "Netherlands", "Croatia", "Sweden", 
              "Austria", "Germany", "Ireland", "Iceland", "Greece", "Luxembourg", 
              "Hungary", "Slovenia", "Montenegro", "Andorra", "Estonia", "Albania",
              "Bosnia & Herzegovina", "Switzerland", "Kosovo", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Romania", 
              "Ukraine", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.74, 75.73, 71.86, 70.82, 69.16, 68.27, 67.69, 67.03, 64.44, 62.36,
            60.10, 55.58, 54.41, 52.22, 47.22, 46.92, 46.48, 44.82, 44.28, 38.64, 
            34.81, 33.31, 33.24, 31.34, 30.94, 30.48, 29.68, 29.20, 28.95, 27.60, 
            26.67, 25.87, 23.15, 21.12, 19.00, 18.23, 17.87, 17.28, 17.12, 16.03, 
            13.35, 13.08, 12.32, 9.78, 8.60, 7.20, 6.40, 4.70),
  Year = 2017)

df_2016 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Spain", "Denmark", "Portugal",
              "Finland", "France", "Croatia", "Netherlands", "Norway", "Sweden", 
              "Austria", "Iceland", "Greece", "Germany", "Ireland", "Hungary", 
              "Luxembourg", "Montenegro", "Estonia", "Albania", "Switzerland", 
              "Andorra", "Serbia", "Cyprus", "Slovenia", "Czechia", "Kosovo", 
              "Georgia", "Bosnia & Herzegovina", "Slovakia", "Bulgaria", "Romania",
              "Italy", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "San Marino", "Moldova", "Belarus", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.75, 81.85, 79.19, 70.95, 70.90, 69.55, 67.25, 66.60, 66.55, 66.10, 
            65.15, 64.85, 62.21, 59.00, 58.30, 55.14, 54.70, 51.40, 50.35, 45.20, 
            38.25, 34.40, 33.15, 32.10, 32.00, 31.95, 31.65, 31.60, 31.55, 30.35, 
            29.40, 29.20, 24.00, 23.45, 19.75, 18.30, 18.20, 18.10, 17.35, 15.55, 
            14.40, 14.15, 13.35, 10.80, 8.75, 7.20, 4.85),
  Year = 2016)

df_2015 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Malta", "Sweden", "Croatia", "Netherlands",
              "Norway", "Spain", "Denmark", "Portugal", "France", "Iceland", "Finland", 
              "Germany", "Austria", "Hungary", "Montenegro", "Luxembourg", "Albania",
              "Ireland", "Greece", "Georgia", "Czechia", "Estonia", "Slovenia",
              "Andorra", "Bosnia & Herzegovina", "Serbia", "Slovakia", "Romania",
              "Switzerland", "Bulgaria", "Poland", "Italy", "Liechtenstein", 
              "Lithuania", "Cyprus", "Kosovo", "Latvia", "Moldova", "Belarus",
              "San Marino", "North Macedonia", "Turkey", "Monaco", "Armenia", 
              "Azerbaijan"),
  Value = c(88.00, 83.00, 77.00, 72.00, 71.00, 69.00, 69.00, 69.00, 68.00, 67.00,
            65.00, 63.00, 62.00, 56.00, 52.00, 50.00, 46.00, 43.00, 42.00, 40.00,
            39.00, 36.00, 35.00, 34.00, 32.00, 31.00, 29.00, 29.00, 29.00, 28.00, 
            28.00, 27.00, 26.00, 22.00, 19.00, 19.00, 18.00, 18.00, 18.00, 16.00,
            14.00, 14.00, 13.00, 12.00, 11.00, 9.00, 5.00),
  Year = 2015)

df_2014 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Spain", "Netherlands", "Norway", 
              "Portugal", "Sweden", "France", "Iceland", "Denmark", "Malta", 
              "Croatia", "Germany", "Hungary", "Austria", "Montenegro", "Finland",
              "Albania", "Slovenia", "Czechia", "Estonia", "Ireland", "Greece", 
              "Slovakia", "Serbia", "Bulgaria", "Switzerland", "Luxembourg", 
              "Romania", "Poland", "Italy", "Georgia", "Lithuania", "Andorra",
              "Bosnia & Herzegovina", "Cyprus", "Latvia", "Liechtenstein", "Kosovo",
              "Moldova", "Turkey", "San Marino", "Belarus", "North Macedonia", 
              "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(80.25, 78.10, 73.26, 69.90, 68.40, 66.60, 65.30, 64.10, 63.95, 59.90,
            56.80, 56.30, 55.68, 53.65, 52.10, 47.05, 45.30, 38.40, 35.00, 34.65, 
            34.65, 33.65, 31.15, 30.50, 30.30, 30.00, 28.85, 28.35, 27.95, 27.65, 
            27.40, 28.05, 21.70, 20.60, 20.10, 19.65, 19.65, 18.00, 17.10, 16.50,
            14.15, 13.70, 13.60, 13.30, 11.65, 10.10, 8.50, 6.60),
  Year = 2014)

df_2013 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Norway", "Sweden", "Spain", "Portugal", 
              "France", "Netherlands", "Denmark", "Iceland", "Hungary", "Germany",
              "Croatia", "Finland", "Austria", "Albania", "Malta", "Slovenia", 
              "Czechia", "Ireland", "Romania", "Estonia", "Switzerland", "Luxembourg",
              "Greece", "Slovakia", "Montenegro", "Serbia", "Poland", "Georgia",
              "Lithuania", "Andorra", "Bosnia & Herzegovina", "Cyprus", "Latvia", 
              "Italy", "Bulgaria", "Liechtenstein", "Turkey", "San Marino", "Belarus",
              "Kosovo", "North Macedonia", "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(78.50, 68.73, 65.65, 65.30, 65.04, 64.60, 64.10, 60.00, 59.80, 55.50,
            54.70, 54.29, 48.30, 47.25, 43.35, 38.40, 35.30, 35.00, 34.65, 33.65, 
            31.30, 28.90, 28.85, 28.35, 28.10, 28.90, 28.65, 25.05, 21.65, 21.05, 
            20.70, 20.60, 19.95, 19.65, 19.65, 19.40, 18.00, 15.50, 14.15, 13.70, 
            13.60, 13.50, 13.30, 11.65, 10.10, 7.50, 7.10),
  Year = 2013)

# combine all data frames into one
df_combined <- bind_rows(df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013)

# create new, compressed df
# step 1: Filter data for 2019 and 2018
df_2019 <- df_combined %>% filter(Year == 2019) %>% rename(Value_2019 = Value)
df_2018 <- df_combined %>% filter(Year == 2018) %>% rename(Value_2018 = Value)

# step 2: Filter data for 2013 and 2014 and calculate the average
df_2013_2014 <- df_combined %>% filter(Year %in% c(2013, 2014)) %>%
  group_by(Country) %>%
  summarise(Avg_2013_2014 = mean(Value, na.rm = TRUE))

# step 3: Join the data frames for 2019 and 2018
df_compressed <- df_2019 %>%
  left_join(df_2018, by = "Country") %>%
  select(Country, Value_2019, Value_2018)

# step 4: Calculate the average for 2019 and 2018
df_compressed <- df_compressed %>%
  mutate(Avg_2019_2018 = (Value_2019 + Value_2018) / 2)

# step 5: Join the average for 2013 and 2014
df_compressed <- df_compressed %>%
  left_join(df_2013_2014, by = "Country")

# step 6: Calculate the difference between the averages
df_compressed <- df_compressed %>%
  mutate(Difference = Avg_2019_2018 - Avg_2013_2014)

# step 7: Select and reorder columns for the final compressed data frame
df_compressed <- df_compressed %>%
  select(Country, Value_2019, Value_2018, Avg_2019_2018, 
         Avg_2013_2014, Difference)

rainbow_df <- df_compressed

# save the data frame
saveRDS(rainbow_df, file = "rainbow_df.rds")
write.csv(rainbow_df, "rainbow_df.csv")



# 3. The Economist: Democracy scores 2018 ---------------------------------
# https://enperspectiva.uy/wp-content/uploads/2019/01/Democracy_Index_2018.pdf
democracy_scores <- data.frame(
  Country = c("Belgium", "Denmark", "Greece", "Spain", "Finland", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("BE", "DK", "GR", "ES", "FI", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Overall_score = c(7.78, 9.22, 7.29, 8.08, 9.14, 7.80, 9.15, 7.71, 8.81, 8.89, 8.29, 7.84, 9.39, 8.68, 8.53, 7.03, 7.59, 7.69, 7.97, 6.63, 7.38, 7.50, 8.21, 6.67, 6.38, 7.10, 7.50, 6.57),
  #Global_rank = c(31, 5, 39, 19, 8, 29, "6=", 33, 12, 11, 16, 27, 3, 13, 14, 46, 35, 34, "23=", "57=", 38, "36=", 18, "54=", "66=", 44, "36=", 60),
  #Regional_rank = c(17, 4, 20, 14, 6, 16, 5, 18, 9, 8, 12, 15, 3, 10, 11, 7, 19, 2, 1, 9, 5, "3=", 13, 8, 12, 6, "3=", 10),
  Electoral_process_and_pluralism = c(9.58, 10.00, 9.58, 9.17, 10.00, 9.58, 9.58, 9.58, 10.00, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.17, 9.17, 9.58, 9.58, 8.75, 9.58, 9.58, 9.17, 9.17, 9.17, 9.58, 9.58, 9.17),
  Functioning_of_government = c(8.93, 9.29, 5.36, 7.14, 8.93, 7.50, 7.86, 6.07, 8.93, 9.29, 7.86, 7.50, 9.64, 8.57, 7.50, 6.43, 6.43, 6.79, 8.21, 6.07, 6.07, 6.43, 8.21, 6.07, 5.71, 6.79, 6.79, 6.07),
  Political_participation = c(5.00, 8.33, 6.11, 7.78, 8.33, 7.78, 8.33, 7.78, 6.67, 8.33, 8.33, 6.11, 8.33, 8.33, 8.33, 7.22, 6.67, 6.67, 6.67, 5.00, 5.56, 6.11, 6.11, 6.11, 5.00, 5.56, 6.67, 5.56),
  Political_culture = c(6.88, 9.38, 6.88, 7.50, 8.75, 5.63, 10.00, 6.88, 8.75, 8.13, 6.88, 6.88, 10.00, 7.50, 8.13, 4.38, 6.88, 6.88, 6.88, 6.25, 6.88, 6.25, 8.75, 4.38, 4.38, 5.63, 6.25, 5.00),
  Civil_liberties = c(8.53, 9.12, 8.53, 8.82, 9.71, 8.53, 10.00, 8.24, 9.71, 9.12, 8.82, 9.12, 9.41, 9.41, 9.12, 7.94, 8.82, 8.53, 8.53, 7.06, 8.82, 9.12, 8.82, 7.65, 7.65, 7.94, 8.24, 7.06),
  Regime_type = c("Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy"))

saveRDS(democracy_scores, file = "democracy_scores.rds")
write.csv(democracy_scores, "democracy_scores.csv")


# 4. Happiness data 2018 ---------------------------------------------------------------------
# https://s3.amazonaws.com/happiness-report/2018/WHR_web.pdf
happiness_scores <- data.frame(
  Country = c("Finland", "Denmark", "Greece", "Spain", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("FI", "DK", "GR", "ES", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Happiness_Score = c(7.632, 7.555, 5.358, 6.310, 6.489, 6.977, 6.000, 6.910, 7.441, 7.139, 5.410, 7.314, 6.965, 6.814, 4.933, 5.762, 6.711, 5.739, 5.620, 5.933, 5.952, 6.627, 6.123, 5.945, 6.173, 5.948, 5.321))

saveRDS(happiness_scores, file = "happiness_scores.rds")
write.csv(happiness_scores, "happiness_scores.csv")


# 5. GDP per capita ---------------------------------------------------------------------
df_GDP <- read_csv("data/raw/data_20250228194704.csv")
## New names:
## Rows: 1064 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): IndicatorName, VariableName, MeasurementName, CountryCode, Alpha3Co... dbl
## (2): IndicatorCode, PeriodCode lgl (1): ...10
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...10`
# apply the mapping to the CountryName column
df_GDP <- df_GDP %>%
  mutate(
    # replace Czechia with Czech Republic
    CountryName = ifelse(CountryName %in% names(country_name_mapping), 
                         country_name_mapping[CountryName], 
                         CountryName)) %>%
  select("CountryName", "PeriodCode", "Value") %>%
  # now filter after standardizing the names
  filter(CountryName %in% survey_country_mapping$country_name)

df_GDP <- df_GDP %>%
  mutate(Value = as.numeric(as.character(Value)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Value = as.numeric(as.character(Value))`.
## Caused by warning:
## ! NAs introduced by coercion
df_GDP <- df_GDP %>%
  # group by country
  group_by(CountryName) %>% 
  # find first and last year values
  summarize(
    gdp_2005 = Value[PeriodCode == 2005],
    gdp_2018 = Value[PeriodCode == 2018],
    # calculate relative growth
    gdp_growth = (gdp_2018 - gdp_2005) / gdp_2005 * 100) %>%
  # add ISO2 codes for easier joining with other datasets
  left_join(survey_country_mapping, by = c("CountryName" = "country_name")) %>%
  # select relevant columns
  select(CountryName, iso2, gdp_2005, gdp_2018, gdp_growth)

saveRDS(df_GDP, file = "df_GDP.rds")
write.csv(df_GDP, "df_GDP.csv")


# 6. ESS Round 9 ----------------------------------------------------------
# https://ess.sikt.no/en/datafile/b2b0bf39-176b-4eca-8d26-3c05ea83d2cb
ess_data <- read_csv("data/raw/ESS9e03_2.csv")
## Rows: 49519 Columns: 572
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): name, proddate, cntry, ctzshipd, cntbrthd, lnghom1, lnghom2, fbrn...
## dbl (562): essround, edition, idno, dweight, pspwght, pweight, anweight, nws...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# select interesting variables, country and weight variables
ess_selected <- ess_data %>%
  select(
    # identifiers and weights
    cntry,          # country code
    pspwght,        # post-stratification weight
    dweight,        # design weight
    
    # key variables of interest
    freehms,        # gays and lesbians free to live life as they wish
    lrscale,        # left-right political scale
    rlgdgr,         # how religious are you
    ipeqopt,        # important that people are treated equally
    atchctr,        # attachment to country
    eduyrs,         # years of education
    agea)            # age of respondent

# function to recode ESS special values (negative values are typically missing values)
recode_ess_missing <- function(x) {
  ifelse(x < 0, NA, x)
}

# clean the data
ess_clean <- ess_selected %>%
  # recode special values to NA
  mutate(across(c(freehms, lrscale, rlgdgr, ipeqopt, atchctr, eduyrs, agea), 
                recode_ess_missing)) %>%
  # create derived variables if needed
  mutate(
    # recode freehms to 0-1 scale (originally 1-5 where 1 = agree strongly)
    freehms_support = case_when(
      freehms %in% c(1, 2) ~ 1,  # agree and strongly agree
      freehms %in% c(3, 4, 5) ~ 0,  # neutral, disagree, strongly disagree
      TRUE ~ NA_real_),
    
    # create age groups
    age_group = case_when(
      agea < 35 ~ "18-34",
      agea < 55 ~ "35-54",
      TRUE ~ "55+"
    ),
    
    # standardise left-right scale to 0-1
    lrscale_std = (lrscale - 1) / 9,  # Original scale is 1-10
    
    # create high education indicator (above country median)
    high_educ = NA  # will fill this in after calculating country medians
  )

# calculate country median education for relative education measure
country_medians <- ess_clean %>%
  group_by(cntry) %>%
  summarize(median_educ = median(eduyrs, na.rm = TRUE))

# join back to main data and create high education indicator
ess_clean <- ess_clean %>%
  left_join(country_medians, by = "cntry") %>%
  mutate(high_educ = ifelse(eduyrs > median_educ, 1, 0))

# calculate weighted means by country
country_aggregates <- ess_clean %>%
  # group by country
  group_by(cntry) %>%
  # calculate weighted statistics
  summarize(
    # sample size
    n_respondents = n(),
    n_valid = sum(!is.na(freehms)),
    
    # weighted means
    pct_lgbt_support = weighted.mean(freehms_support, w = pspwght, na.rm = TRUE) * 100,
    mean_religiosity = weighted.mean(rlgdgr, w = pspwght, na.rm = TRUE),
    mean_left_right = weighted.mean(lrscale_std, w = pspwght, na.rm = TRUE),
    mean_equal_values = weighted.mean(ipeqopt, w = pspwght, na.rm = TRUE),
    mean_country_attach = weighted.mean(atchctr, w = pspwght, na.rm = TRUE),
    mean_eduyrs = weighted.mean(eduyrs, w = pspwght, na.rm = TRUE),
    mean_age = weighted.mean(agea, w = pspwght, na.rm = TRUE),
    
    # weighted proportions for categorical variables
    pct_young = weighted.mean(age_group == "18-34", w = pspwght, na.rm = TRUE) * 100,
    pct_high_educ = weighted.mean(high_educ, w = pspwght, na.rm = TRUE) * 100,
    
    # standard errors (for confidence intervals)
    se_lgbt_support = sd(freehms_support, na.rm = TRUE) / sqrt(sum(!is.na(freehms_support))),
    
    # missing data proportions
    pct_missing_lgbt = mean(is.na(freehms)) * 100)

# calculate cross-variable country indicators
country_indicators <- ess_clean %>%
  group_by(cntry) %>%
  summarize(
    # correlation between age and LGBT support within country
    age_lgbt_corr = cor(agea, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # correlation between religiosity and LGBT support
    relig_lgbt_corr = cor(rlgdgr, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # inequality in LGBT support (standard deviation)
    lgbt_support_inequality = sd(freehms_support, na.rm = TRUE),
    
    # education gradient in LGBT support (difference between high and low education)
    educ_gradient = weighted.mean(freehms_support[high_educ == 1], w = pspwght[high_educ == 1], na.rm = TRUE) - 
      weighted.mean(freehms_support[high_educ == 0], w = pspwght[high_educ == 0], na.rm = TRUE))

# join the aggregates and indicators
country_data_final <- country_aggregates %>%
  left_join(country_indicators, by = "cntry") %>%
  # create ISO country codes for easier merging with other datasets
  mutate(
    iso2c = countrycode(cntry, "iso2c", "iso2c"),
    iso3c = countrycode(cntry, "iso2c", "iso3c"))

# plot LGBT support by country
ggplot(country_data_final, aes(x = reorder(cntry, pct_lgbt_support), y = pct_lgbt_support)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = pct_lgbt_support - 1.96*se_lgbt_support, 
                    ymax = pct_lgbt_support + 1.96*se_lgbt_support), 
                width = 0.2) +
  labs(title = "Support for LGBT Rights by Country",
       subtitle = "Percent agreeing gays and lesbians should be free to live as they wish",
       x = "Country",
       y = "Support (%)") +
  theme_minimal() +
  coord_flip()

# examine relationship between religiosity and LGBT support
ggplot(country_data_final, aes(x = mean_religiosity, y = pct_lgbt_support, label = cntry)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_text(hjust = -0.3, vjust = 0.3) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Religiosity vs. LGBT Support by Country",
       x = "Mean Religiosity Score",
       y = "LGBT Support (%)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

saveRDS(country_data_final, file = "country_data_final.rds")
write.csv(country_data_final, "country_data_final.csv")



# 7. Unemployment rate ----------------------------------------------------
# https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate

# scrape data from Wikipedia
url <- "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate"
page <- read_html(url)
tables <- html_table(page, fill = TRUE)
eu_unemployment_table <- tables[[1]]

# clean column names - simplify them to more standard names
colnames(eu_unemployment_table) <- c("Country", "Unemployment", "Employment", "Year")

# clean up the country names by removing footnote references
eu_unemployment_table$Country <- gsub("\\[.*?\\]", "", eu_unemployment_table$Country)

# make sure numeric columns are properly formatted
eu_unemployment_table$Unemployment <- as.character(eu_unemployment_table$Unemployment)
eu_unemployment_table$Employment <- as.numeric(as.character(eu_unemployment_table$Employment))
eu_unemployment_table$Year <- as.numeric(as.character(eu_unemployment_table$Year))

eu_unemployment_table <- eu_unemployment_table %>%
  mutate(
    # trim spaces from country names
    Country = trimws(Country),
    # convert Unemployment to numeric (remove any % signs or spaces if present)
    Unemployment = as.numeric(gsub("[^0-9.]", "", Unemployment)))

# modify Greece's unemployment rate to the 2018 value
eu_unemployment_table$Unemployment[eu_unemployment_table$Country == "Greece"] <- 19.18
eu_unemployment_table$Year[eu_unemployment_table$Country == "Greece"] <- 2018

# add the UK data manually
uk_data <- data.frame(
  Country = "United Kingdom",
  Unemployment = 4.12, # from Statista
  Employment = 75.25,  # estimated from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/articles/singlemonthlabourforcesurveyestimates/december2018
  Year = 2018)

# append UK data to the table
eu_unemployment_table <- rbind(eu_unemployment_table, uk_data)

# ignore the year column
eu_unemployment_table <- eu_unemployment_table %>%
  select(-Year)

# save
saveRDS(eu_unemployment_table, file = "eu_unemployment_table.rds")
write.csv(eu_unemployment_table, "eu_unemployment_table.csv", row.names = FALSE)


# 8. Gender equality ------------------------------------------------------
# https://eige.europa.eu/gender-statistics/dgs/indicator/index__index_scores/datatable?time=2017&col=domain&row=geo
gender_eq_index <- read_xlsx("data/raw/index__index_scores.xlsx", range = "A16:V44")

gender_eq_index <- gender_eq_index %>%
  select(country_name = "Geographic region\\(Sub-) Domain Scores", 
         gender_equality_index = "Overall Gender Equality Index") %>%
  mutate(country_name = ifelse(country_name == "Czechia", "Czech Republic", country_name))

# Merge the data ----------------------------------------------------------
# create a base dataframe with country identifying variables
country_level_df <- survey_country_mapping

# merge V-Dem data
country_level_df <- country_level_df %>%
  left_join(vdem_2019, by = c("country_name" = "country_name"))

# fix country name in democracy_scores for Czech Republic if needed
if(any(democracy_scores$Country == "Czechia")) {
  democracy_scores$Country[democracy_scores$Country == "Czechia"] <- "Czech Republic"
}

# merge democracy scores
country_level_df <- country_level_df %>%
  left_join(democracy_scores, by = c("iso2" = "ISO2"))

# merge GDP data
country_level_df <- country_level_df %>%
  left_join(df_GDP, by = "iso2")

# rename some countries to match our country_name format
rainbow_country_mapping <- data.frame(
  original = c("Czechia", "Andorra", "Bosnia & Herzegovina", "North Macedonia", "United Kingdom"),
  standardized = c("Czech Republic", "Andorra", "Bosnia and Herzegovina", "Macedonia", "United Kingdom"),
  stringsAsFactors = FALSE)

# apply standardized country names
for(i in 1:nrow(rainbow_country_mapping)) {
  rainbow_df$Country[rainbow_df$Country == rainbow_country_mapping$original[i]] <- 
    rainbow_country_mapping$standardized[i]
}

# extract and rename rainbow map variables for clarity
rainbow_data_clean <- rainbow_df %>%
  select(
    Country,
    rainbow_score_2019 = Value_2019,
    rainbow_score_2018 = Value_2018,
    rainbow_score_avg_2019_2018 = Avg_2019_2018,
    rainbow_score_avg_2013_2014 = Avg_2013_2014,
    rainbow_score_difference = Difference)

# merge Rainbow Map data
country_level_df <- country_level_df %>%
  left_join(rainbow_data_clean, by = c("country_name" = "Country"))

# merge happiness data
country_level_df <- country_level_df %>%
  left_join(happiness_scores, by = c("iso2" = "ISO2"))

# merge unemployment data
country_level_df <- country_level_df %>%
  left_join(eu_unemployment_table, by = c("country_name" = "Country"))

# create a mapping between ESS country codes and ISO2
ess_country_mapping <- data.frame(
  cntry = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
            "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
            "SK", "SI", "ES", "SE", "GB"),
  iso2 = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
           "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
           "SK", "SI", "ES", "SE", "GB"),
  stringsAsFactors = FALSE)

# first ensure cntry codes match our iso2 codes
country_data_final <- country_data_final %>%
  left_join(ess_country_mapping, by = "cntry") %>%
  select(-iso2c, -iso3c) # remove original ISO codes to avoid confusion

# rename variables for clarity
ess_data_clean <- country_data_final %>%
  select(
    cntry,
    n_respondents,
    n_valid,
    lgbt_support_percent = pct_lgbt_support,
    mean_religiosity,
    mean_left_right,
    mean_equal_values,
    mean_country_attach,
    mean_eduyrs,
    mean_age,
    pct_young,
    pct_high_educ,
    se_lgbt_support,
    pct_missing_lgbt,
    age_lgbt_corr,
    relig_lgbt_corr,
    lgbt_support_inequality,
    educ_gradient,
    #iso3c
  )

# merge ESS data
country_level_df <- country_level_df %>%
  left_join(ess_data_clean, by = c("iso2" = "cntry"))

country_level_df %>% View()

# delete the first Greece row
greece_rows <- which(country_level_df$country_name == "Greece")

if (length(greece_rows) > 1) {
  # remove the first instance of Greece
  country_level_df <- country_level_df[-greece_rows[1], ]
  
  # verify the fix worked
  greece_check <- country_level_df %>%
    filter(country_name == "Greece")
  print("Greece entries after removing the first instance:")
  print(greece_check)
}

# add the gender equality index
country_level_df <- country_level_df %>%
  left_join(gender_eq_index, by = "country_name")

## scaling: necessary for the regression analysis later on because the variables have vastly different scales
country_level_df %>% 
  select(where(is.numeric)) %>% 
  summary()
##    v2x_libdem     v2x_polyarchy      v2x_gender      v2x_egaldem    
##  Min.   :0.3680   Min.   :0.4720   Min.   :0.8410   Min.   :0.3340  
##  1st Qu.:0.7143   1st Qu.:0.8105   1st Qu.:0.8908   1st Qu.:0.6977  
##  Median :0.7855   Median :0.8595   Median :0.9220   Median :0.7550  
##  Mean   :0.7386   Mean   :0.8236   Mean   :0.9147   Mean   :0.7195  
##  3rd Qu.:0.8220   3rd Qu.:0.8808   3rd Qu.:0.9445   3rd Qu.:0.7917  
##  Max.   :0.8840   Max.   :0.9140   Max.   :0.9590   Max.   :0.8760  
##                                                                     
##   v2x_liberal       v2xcs_ccsi       v2x_freexp     Overall_score  
##  Min.   :0.7060   Min.   :0.5460   Min.   :0.6470   Min.   :6.380  
##  1st Qu.:0.8718   1st Qu.:0.8395   1st Qu.:0.9113   1st Qu.:7.357  
##  Median :0.9230   Median :0.9125   Median :0.9440   Median :7.790  
##  Mean   :0.8914   Mean   :0.8844   Mean   :0.9088   Mean   :7.886  
##  3rd Qu.:0.9380   3rd Qu.:0.9425   3rd Qu.:0.9702   3rd Qu.:8.568  
##  Max.   :0.9810   Max.   :0.9710   Max.   :0.9840   Max.   :9.390  
##                                                                    
##  Electoral_process_and_pluralism Functioning_of_government
##  Min.   : 8.750                  Min.   :5.360            
##  1st Qu.: 9.170                  1st Qu.:6.340            
##  Median : 9.580                  Median :7.320            
##  Mean   : 9.493                  Mean   :7.373            
##  3rd Qu.: 9.580                  3rd Qu.:8.300            
##  Max.   :10.000                  Max.   :9.640            
##                                                           
##  Political_participation Political_culture Civil_liberties     gdp_2005    
##  Min.   :5.000           Min.   : 4.380    Min.   : 7.060   Min.   : 9602  
##  1st Qu.:6.110           1st Qu.: 6.250    1st Qu.: 8.240   1st Qu.:16572  
##  Median :6.670           Median : 6.880    Median : 8.820   Median :26314  
##  Mean   :6.885           Mean   : 7.034    Mean   : 8.656   Mean   :26425  
##  3rd Qu.:8.330           3rd Qu.: 8.130    3rd Qu.: 9.120   3rd Qu.:32867  
##  Max.   :8.330           Max.   :10.000    Max.   :10.000   Max.   :68708  
##                                                                            
##     gdp_2018        gdp_growth     rainbow_score_2019 rainbow_score_2018
##  Min.   : 24052   Min.   : 19.15   Min.   :16.83      Min.   :16.07     
##  1st Qu.: 32335   1st Qu.: 53.44   1st Qu.:25.69      1st Qu.:28.92     
##  Median : 41545   Median : 68.22   Median :44.77      Median :51.40     
##  Mean   : 45761   Mean   : 83.19   Mean   :42.98      Mean   :49.39     
##  3rd Qu.: 52630   3rd Qu.:114.35   3rd Qu.:57.75      3rd Qu.:67.19     
##  Max.   :116334   Max.   :207.95   Max.   :74.72      Max.   :91.94     
##                                                                         
##  rainbow_score_avg_2019_2018 rainbow_score_avg_2013_2014
##  Min.   :16.45               Min.   :19.65              
##  1st Qu.:26.89               1st Qu.:29.31              
##  Median :48.34               Median :40.52              
##  Mean   :46.19               Mean   :43.86              
##  3rd Qu.:62.90               3rd Qu.:60.91              
##  Max.   :83.33               Max.   :79.38              
##                                                         
##  rainbow_score_difference Happiness_Score  Unemployment      Employment   
##  Min.   :-10.270          Min.   :4.933   Min.   : 2.800   Min.   :52.50  
##  1st Qu.: -6.414          1st Qu.:5.848   1st Qu.: 4.115   1st Qu.:64.97  
##  Median :  0.185          Median :6.173   Median : 5.650   Median :67.40  
##  Mean   :  2.325          Mean   :6.337   Mean   : 6.654   Mean   :67.17  
##  3rd Qu.:  5.274          3rd Qu.:6.938   3rd Qu.: 7.550   3rd Qu.:71.17  
##  Max.   : 37.280          Max.   :7.632   Max.   :19.180   Max.   :77.00  
##                           NA's   :1                                       
##  n_respondents     n_valid     lgbt_support_percent mean_religiosity
##  Min.   : 781   Min.   : 781   Min.   :30.85        Min.   :3.401   
##  1st Qu.:1529   1st Qu.:1529   1st Qu.:55.17        1st Qu.:4.490   
##  Median :1761   Median :1761   Median :75.34        Median :5.248   
##  Mean   :1769   Mean   :1769   Mean   :70.21        Mean   :5.343   
##  3rd Qu.:2200   3rd Qu.:2200   3rd Qu.:88.25        3rd Qu.:6.029   
##  Max.   :2745   Max.   :2745   Max.   :94.25        Max.   :7.513   
##  NA's   :4      NA's   :4      NA's   :4            NA's   :4       
##  mean_left_right  mean_equal_values mean_country_attach  mean_eduyrs   
##  Min.   :0.8134   Min.   :1.718     Min.   :6.974       Min.   :10.97  
##  1st Qu.:1.1858   1st Qu.:2.058     1st Qu.:7.977       1st Qu.:13.40  
##  Median :1.4188   Median :2.164     Median :8.116       Median :13.97  
##  Mean   :1.7429   Mean   :2.301     Mean   :8.177       Mean   :13.96  
##  3rd Qu.:2.1136   3rd Qu.:2.529     3rd Qu.:8.484       3rd Qu.:14.43  
##  Max.   :3.8986   Max.   :3.271     Max.   :9.341       Max.   :16.15  
##  NA's   :4        NA's   :4         NA's   :4           NA's   :4      
##     mean_age       pct_young     pct_high_educ   se_lgbt_support   
##  Min.   :45.46   Min.   :23.93   Min.   :26.32   Min.   :0.005644  
##  1st Qu.:48.48   1st Qu.:26.64   1st Qu.:36.40   1st Qu.:0.007032  
##  Median :49.44   Median :28.34   Median :41.12   Median :0.009516  
##  Mean   :52.31   Mean   :28.25   Mean   :39.98   Mean   :0.010061  
##  3rd Qu.:54.73   3rd Qu.:29.77   3rd Qu.:45.45   3rd Qu.:0.011830  
##  Max.   :68.17   Max.   :34.43   Max.   :52.72   Max.   :0.018152  
##  NA's   :4       NA's   :4       NA's   :4       NA's   :4         
##  pct_missing_lgbt age_lgbt_corr       relig_lgbt_corr   
##  Min.   :0        Min.   :-0.253892   Min.   :-0.27248  
##  1st Qu.:0        1st Qu.:-0.183525   1st Qu.:-0.19308  
##  Median :0        Median :-0.132761   Median :-0.14838  
##  Mean   :0        Mean   :-0.132479   Mean   :-0.14908  
##  3rd Qu.:0        3rd Qu.:-0.084937   3rd Qu.:-0.11943  
##  Max.   :0        Max.   : 0.009616   Max.   :-0.02086  
##  NA's   :4        NA's   :4           NA's   :4         
##  lgbt_support_inequality educ_gradient     gender_equality_index
##  Min.   :0.2294          Min.   :0.01615   Min.   :50.00        
##  1st Qu.:0.3139          1st Qu.:0.04848   1st Qu.:55.55        
##  Median :0.4196          Median :0.07695   Median :60.10        
##  Mean   :0.3966          Mean   :0.08256   Mean   :62.38        
##  3rd Qu.:0.4836          3rd Qu.:0.12288   3rd Qu.:69.25        
##  Max.   :0.5001          Max.   :0.16813   Max.   :82.60        
##  NA's   :4               NA's   :4         NA's   :1
# we'll apply different scaling methods based on the type of variable:
# 1. z-score standardization: for continuous variables to make them comparable (mean=0, sd=1)
# 2. min-max normalization: for variables with natural bounds (e.g., 0-100 scores)
# 3. log transformation: for heavily skewed variables like GDP

# custom function to standardize (z-score)
standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# custom function to min-max normalize
normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# z-scores for the democracy scores
country_level_df <- country_level_df %>%
  mutate(
    z_v2x_libdem = standardize(v2x_libdem),
    z_v2x_polyarchy = standardize(v2x_polyarchy),
    z_v2x_gender = standardize(v2x_gender),
    z_v2x_egaldem = standardize(v2x_egaldem),
    z_v2x_liberal = standardize(v2x_liberal),
    z_v2xcs_ccsi = standardize(v2xcs_ccsi),
    z_v2x_freexp = standardize(v2x_freexp))

# z-scores for GDP and unemployment rate, natural log for GDP
country_level_df <- country_level_df %>%
  mutate(
    z_gdp_2018 = standardize(gdp_2018),
    log_gdp_2018 = log(gdp_2018),
    z_gdp_growth = standardize(gdp_growth),
    z_unemployment = standardize(Unemployment))

# z-scores and normalised rainbow variables 
country_level_df <- country_level_df %>%
  mutate(
    z_rainbow_score = standardize(rainbow_score_2019),
    norm_rainbow_score = normalize(rainbow_score_2019),
    z_lgbt_support = standardize(lgbt_support_percent),
    norm_lgbt_support = normalize(lgbt_support_percent))

# z-socres and normalised for happiness scores and gender equality
country_level_df <- country_level_df %>%
  mutate(
    z_happiness = standardize(Happiness_Score),
    z_gender_equality = standardize(gender_equality_index),
    norm_gender_equality = normalize(gender_equality_index))

# religion, politics, education and age
country_level_df <- country_level_df %>%
  mutate(
    z_religiosity = standardize(mean_religiosity),
    z_functioning_of_government = standardize(Functioning_of_government),
    z_left_right = standardize(mean_left_right),
    z_equal_values = standardize(mean_equal_values),
    z_country_attach = standardize(mean_country_attach),
    z_eduyrs = standardize(mean_eduyrs),
    z_age = standardize(mean_age))

# create two composite scores and regional classification
country_level_df <- country_level_df %>%
  mutate(
    composite_equality = rowMeans(
      cbind(z_gender_equality, z_rainbow_score, z_v2x_gender),
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    composite_democracy = rowMeans(
      cbind(z_v2x_libdem, z_v2x_polyarchy, z_v2x_libdem, z_v2x_freexp), 
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    region = case_when(
      country_name %in% c("Denmark", "Finland", "Sweden", "Estonia", "Latvia", "Lithuania") ~ 
        "Northern Europe",
      country_name %in% c("Belgium", "Netherlands", "Luxembourg", "Germany", "France", 
                          "Austria", "United Kingdom", "Ireland") ~ 
          "Western Europe",
       country_name %in% c("Portugal", "Spain", "Italy", "Malta", "Greece", "Cyprus") ~ 
        "Southern Europe",
      country_name %in% c("Poland", "Czech Republic", "Slovakia", "Hungary", "Slovenia", 
                          "Croatia", "Romania", "Bulgaria") ~           "Eastern Europe",
        TRUE ~ NA_character_))

# delete the "Country.x", "Country.y" and "CountryName" columns
country_level_df <- country_level_df %>%
  select(-c("Country.x", "Country.y", "CountryName"))

# save as RDS and csv
saveRDS(country_level_df, file = "country_level_df.rds")
write.csv(country_level_df, "country_level_df.csv")

Data cleaning

### data cleaning

library(tidyverse)
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:dplyr':
## 
##     as_label
## The following object is masked from 'package:ggplot2':
## 
##     as_label
library(haven)
## 
## Attaching package: 'haven'
## The following objects are masked from 'package:sjlabelled':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(mice)

data <- read_dta("data/raw/ZA7575.dta")

### 
# code the questions properly:
# 'DK' and refusals are especially a problem for questions with ordinal variables, 
# less so for non-ordinal variables; for reasons of completeness, we clean all of
# the 'DK' and refusals (and some other special cases) by converting them to NAs

data_correctly_coded <- data %>%
  mutate(
    # hard cases for me
    d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
    d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
    d60 = ifelse(d60 == 7, NA, d60),
    d25 = ifelse(d25 == 8, NA, d25),
    d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
    d7 = ifelse(d7 %in% c(15,97), NA, d7),
    d1 = ifelse(d1 %in% c(97,98), NA, d1),
    
    qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
    qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
    qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
    qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
    
    d71_1 = ifelse(d71_1 == 4, NA, d71_1),
    d71_2 = ifelse(d71_2 == 4, NA, d71_2),
    d71_3 = ifelse(d71_3 == 4, NA, d71_3),
    #qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    #qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    #qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    #qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    
    qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
    qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
    qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
    qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
    qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
    qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
    qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
    
    qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
    
    qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
    qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
    qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
    qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
    
    sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
    sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
    sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
    sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
    sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
    sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
    sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
    sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
    
    qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
    qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
    qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
    qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
    qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
    qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
    qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
    qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
    qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
    qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
    
    qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
    qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
    qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
    qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
    
    qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
    qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
    qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
    qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
    qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
    qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
    qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
    qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
    qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
    qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
    qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
    
    qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
    qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
    qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
    qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
    qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
    qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
    qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
    qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
    qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
    qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
    qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
    
    qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
    qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
    qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
    qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
    qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
    qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
    
    qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
    qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
    qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
    qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
    qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
    qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
    qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
    qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
    qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
    qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
    qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
    qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
    qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
    qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
    qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
    
    qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
    qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
    qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
    qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
    qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
    qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
    qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
    qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
    qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
    qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
    qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
    qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
    qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
    qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
    qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
    
    qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
    qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
    qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
    
    qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
    
    qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
    qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
    qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
    qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
    qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
    qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
    qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
    
    qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
    qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
    qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
    
    sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
    
    # easy cases for Claude
    #q1 = ifelse(q1 %in% c(29,30), NA, q1),
    
    qa1 = ifelse(qa1 == 5, NA, qa1),
    
    qa7 = ifelse(qa7 == 5, NA, qa7),
    
    qa8 = ifelse(qa8 == 4, NA, qa8),
    
    qa9 = ifelse(qa9 == 6, NA, qa9),
    
    qa14 = ifelse(qa14 == 6, NA, qa14),
    
    qa17 = ifelse(qa17 == 5, NA, qa17),
    
    qb6 = ifelse(qb6 == 4, NA, qb6),
    
    qb7 = ifelse(qb7 == 5, NA, qb7),
    
    qc19 = ifelse(qc19 == 3, NA, qc19),
    
    qc20 = ifelse(qc20 == 3, NA, qc20),
    
    d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
    
    d77 = ifelse(d77 == 5, NA, d77),
    
    d70 = ifelse(d70 == 5, NA, d70),
    
    qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
    
    qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
    
    qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
    
    qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
    
    qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
    
    qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
    
    qb8 = ifelse(qb8 == 5, NA, qb8),
    
    sd3 = ifelse(sd3 == 16, NA, sd3),
    
    qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
    
    qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
    
    qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
    
    qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))

# write_rds(data_correctly_coded, file = "data_correctly_coded.rds")


###
# get rid of all the "r" coded ones
# for the questions with underscores, (1) if they are on a scale, e.g., 1-3 and 'DK'
# is one of them, replace 'DK'; if it's 0-1 coding, keep that (can't do anything about it)

data_correctly_coded <- data_correctly_coded %>%
  select(-matches("_r$|_r[0-9]$|_r[0-9]_")) 
# use setdiff() to check whether it's done properly 


### 
# now analyse the patterns of NAs across columns
missing_data_before <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

# view top variables with missing data
print(head(missing_data_before, 20))
## # A tibble: 20 × 2
##    variable missing_proportion
##    <chr>                 <dbl>
##  1 p6mt                  0.982
##  2 p13mt                 0.982
##  3 p6cy                  0.982
##  4 p7cy                  0.982
##  5 p6lu                  0.981
##  6 p7lu                  0.981
##  7 p13lu                 0.981
##  8 p6hr                  0.964
##  9 p7hr                  0.964
## 10 p6ee                  0.963
## 11 p6fi                  0.963
## 12 p6lt                  0.963
## 13 p7ee                  0.963
## 14 p7fi                  0.963
## 15 p7lt                  0.963
## 16 p13ee                 0.963
## 17 p13fi                 0.963
## 18 p6dk                  0.963
## 19 p7dk                  0.963
## 20 p6es                  0.963
# viz
ggplot(missing_data_before %>% filter(missing_proportion > 0.25), 
       aes(x = reorder(variable, missing_proportion), y = missing_proportion)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variables with >25% Missing Data",
       x = "Variable",
       y = "Proportion Missing") +
  theme_minimal()

# calculate overall proportion of missing data
mean(missing_data_before$missing_proportion)
## [1] 0.1424237
# deeper analysis
missing_by_prefix <- missing_data_before %>%
  mutate(prefix = str_extract(variable, "^[a-z]+\\d+")) %>%
  group_by(prefix) %>%
  summarise(
    avg_missing = mean(missing_proportion),
    n_vars = n(),
    max_missing = max(missing_proportion),
    min_missing = min(missing_proportion)
  ) %>%
  arrange(desc(avg_missing))

print(head(missing_by_prefix, 15))
## # A tibble: 15 × 5
##    prefix avg_missing n_vars max_missing min_missing
##    <chr>        <dbl>  <int>       <dbl>       <dbl>
##  1 p13          0.945      8       0.982      0.779 
##  2 p6           0.931     29       0.982      0     
##  3 p7           0.930     28       0.982      0.0180
##  4 qc3          0.861      1       0.861      0.861 
##  5 qa3          0.679      7       0.679      0.679 
##  6 qa15         0.627      5       0.627      0.627 
##  7 qa2          0.376      8       0.376      0.376 
##  8 d15          0.261      2       0.522      0     
##  9 qb7          0.208      1       0.208      0.208 
## 10 d40          0.157      5       0.784      0     
## 11 qa13         0.124      1       0.124      0.124 
## 12 qc19         0.120      1       0.120      0.120 
## 13 qc16         0.117      1       0.117      0.117 
## 14 qc20         0.116      1       0.116      0.116 
## 15 qc10         0.109      1       0.109      0.109
# we take out the following variables because (1) they exhibited high missingness
# while at the same aren't super releveant (that's an assumption we make) for our
# subsequent analysis
data_correctly_coded <- data_correctly_coded %>%
  select(
    # Exclude variables with specific prefixes
    -starts_with("p6"),   # Size of locality
    -starts_with("p7"),   # Region
    -starts_with("p13"),  # Language of interview
    -starts_with("d40"),  # Household size
    -starts_with("qa3"),  # Not benefitting from trade
    -starts_with("qa15"), # Countries bought goods from
    -starts_with("qa2"),  # Benefitting from trade
    -starts_with("qb8"),  # EU energy label influence
    -starts_with("qa18b"), # Information sources follow-up
    -starts_with("qc3")   # Discrimination circumstances
  )

# check how much we imporved the missingness rate
missing_data_after <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

mean(missing_data_after$missing_proportion)
## [1] 0.02018962
# prepare for imputation
data_correctly_coded <-  data_correctly_coded %>%
  mutate(isocntry = as.factor(isocntry)) %>%
  # remove other problematic variables (like IDs) if needed
  select(!any_of(c("doi", "version", "caseid", "uniqid", "serialid"))) %>%
  sjlabelled::remove_all_labels()

# delete variables with zero variance as they cannot ever be useful predictors
var_zero <- data_correctly_coded %>% 
  select(where(is.numeric)) %>%
  summarise(across(everything(), var, na.rm = TRUE)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "variance") %>%
  filter(variance == 0 | is.na(variance))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), var, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
if(nrow(var_zero) > 0) {
  cat("Removing", nrow(var_zero), "variables with zero variance\n")
  data_correctly_coded <- data_correctly_coded %>%
    select(!any_of(var_zero$variable))
}
## Removing 6 variables with zero variance
data_correctly_coded <- data_correctly_coded %>%
  # convert country codes: collapse East and West Germany into one DE
  mutate(isocntry = case_when(
    isocntry %in% c("DE-W", "DE-E") ~ "DE",
    TRUE ~ isocntry))

###
# run the imputation
set.seed(1212)

start_time <- Sys.time()
#imp_model <- mice(data_correctly_coded, m = 3, method = 'rf', maxit = 3)
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.0007889271 secs
#df_rf_new <- complete(imp_model, action = 3)

#write_rds(df_rf_new, "df_rf_new.rds")

Data cleaning: external datasets

## why still z-scores for mean religosity, left-right  etc. 

country_level_df <- readRDS("country_level_df.rds")

# first, let's check which variables have missing values
missing_summary <- colSums(is.na(country_level_df))
missing_summary <- missing_summary[missing_summary > 0]
print(missing_summary)
##         Happiness_Score           n_respondents                 n_valid 
##                       1                       4                       4 
##    lgbt_support_percent        mean_religiosity         mean_left_right 
##                       4                       4                       4 
##       mean_equal_values     mean_country_attach             mean_eduyrs 
##                       4                       4                       4 
##                mean_age               pct_young           pct_high_educ 
##                       4                       4                       4 
##         se_lgbt_support        pct_missing_lgbt           age_lgbt_corr 
##                       4                       4                       4 
##         relig_lgbt_corr lgbt_support_inequality           educ_gradient 
##                       4                       4                       4 
##   gender_equality_index          z_lgbt_support       norm_lgbt_support 
##                       1                       4                       4 
##             z_happiness       z_gender_equality    norm_gender_equality 
##                       1                       1                       1 
##           z_religiosity            z_left_right          z_equal_values 
##                       4                       4                       4 
##        z_country_attach                z_eduyrs                   z_age 
##                       4                       4                       4
# Visualize missing data patterns
vis_miss(country_level_df)

# we have the following imputation strategy:
# (1) we will have to delete the variables from the ESS because the Eurobarometer
# survey contains similar information and imputing these values didn't seem appropriate
# (2) we chose to impute the happiness score using KNN

# (1) 
# delete ESS variables
ESS_variables <- c(
  # original survey variables
  "n_respondents", "n_valid", "lgbt_support_percent", "mean_religiosity", 
  "mean_left_right", "mean_equal_values", "mean_country_attach", 
  "mean_eduyrs", "mean_age", "pct_young", "pct_high_educ", "se_lgbt_support",
  
  # additional ESS metrics
  "pct_missing_lgbt", "age_lgbt_corr", "relig_lgbt_corr", 
  "lgbt_support_inequality", "educ_gradient",
  
  # z-score transformations of ESS variables
  "z_religiosity", "z_left_right", "z_equal_values", 
  "z_country_attach", "z_eduyrs", "z_age")

country_level_df <- country_level_df %>%
  select(-all_of(ESS_variables))

# (2)
# create separate datasets for variables that need imputation
missing_variables <- names(missing_summary)[missing_summary < 5 & 
                                             !(names(missing_summary) %in% ESS_variables)]

# for our KNN imputation, we need to identify variables to use as predictors
# these should be variables without NAs that also correlate with those we want to impute

# identify complete variables that could serve as predictors
complete_variables <- names(country_level_df)[colSums(is.na(country_level_df)) == 0]
complete_variables <- complete_variables[!complete_variables %in% c("Unnamed: 0", "country_name", "iso2", "region")]

print(complete_variables)
##  [1] "v2x_libdem"                      "v2x_polyarchy"                  
##  [3] "v2x_gender"                      "v2x_egaldem"                    
##  [5] "v2x_liberal"                     "v2xcs_ccsi"                     
##  [7] "v2x_freexp"                      "Overall_score"                  
##  [9] "Electoral_process_and_pluralism" "Functioning_of_government"      
## [11] "Political_participation"         "Political_culture"              
## [13] "Civil_liberties"                 "Regime_type"                    
## [15] "gdp_2005"                        "gdp_2018"                       
## [17] "gdp_growth"                      "rainbow_score_2019"             
## [19] "rainbow_score_2018"              "rainbow_score_avg_2019_2018"    
## [21] "rainbow_score_avg_2013_2014"     "rainbow_score_difference"       
## [23] "Unemployment"                    "Employment"                     
## [25] "z_v2x_libdem"                    "z_v2x_polyarchy"                
## [27] "z_v2x_gender"                    "z_v2x_egaldem"                  
## [29] "z_v2x_liberal"                   "z_v2xcs_ccsi"                   
## [31] "z_v2x_freexp"                    "z_gdp_2018"                     
## [33] "log_gdp_2018"                    "z_gdp_growth"                   
## [35] "z_unemployment"                  "z_rainbow_score"                
## [37] "norm_rainbow_score"              "z_functioning_of_government"    
## [39] "composite_equality"              "composite_democracy"
# custom function to impute variables with KNN, handling non-numeric data
knn_impute <- function(data, vars_to_impute, predictor_vars, k = 5) {
  # create a copy of the data
  imputed_data <- data
  
  # ensure predictor variables are numeric and complete
  pred_data <- data[, predictor_vars, drop = FALSE]
  pred_data <- as.data.frame(lapply(pred_data, function(x) as.numeric(x)))
  
  # remove any non-numeric columns
  numeric_cols <- sapply(pred_data, is.numeric)
  if(any(!numeric_cols)) {
    warning("Removing non-numeric predictor columns: ", 
            paste(names(pred_data)[!numeric_cols], collapse=", "))
    pred_data <- pred_data[, numeric_cols, drop=FALSE]
    predictor_vars <- predictor_vars[numeric_cols]
  }
  
  # handle missing values in predictor variables by using column means
  for(col in names(pred_data)) {
    if(any(is.na(pred_data[[col]]))) {
      pred_data[[col]][is.na(pred_data[[col]])] <- mean(pred_data[[col]], na.rm=TRUE)
    }
  }
  
  # standardize the predictor variables manually to avoid issues
  pred_data_std <- as.data.frame(scale(pred_data))
  
  # for each variable to impute:
  for (var in vars_to_impute) {
    if (var %in% names(data) && sum(is.na(data[[var]])) > 0) {
      # ensure the target variable is numeric
      if (!is.numeric(data[[var]])) {
        cat("Skipping", var, "as it is not numeric\n")
        next
      }
      
      # identify cases with missing values
      missing_indices <- which(is.na(data[[var]]))
      
      # for each missing value:
      for (idx in missing_indices) {
        # calculate Euclidean distances to all other countries
        distances <- numeric(nrow(data))
        
        for (i in 1:nrow(data)) {
          if (i != idx) {
            # calculate squared differences for each predictor
            squared_diffs <- sapply(names(pred_data_std), function(p) {
              (pred_data_std[idx, p] - pred_data_std[i, p])^2
            })
            
            # sum and take square root for Euclidean distance
            distances[i] <- sqrt(sum(squared_diffs, na.rm=TRUE))
          } else {
            distances[i] <- Inf  # don't use the country itself
          }
        }
        
        # find k nearest neighbors with non-missing values for this variable
        valid_neighbors <- which(!is.na(data[[var]]) & !is.infinite(distances))
        
        if (length(valid_neighbors) > 0) {
          # order the neighbors by distance
          neighbor_order <- order(distances[valid_neighbors])
          valid_neighbors <- valid_neighbors[neighbor_order]
          
          # take the k nearest, or as many as available if fewer than k
          k_actual <- min(k, length(valid_neighbors))
          nearest_k <- valid_neighbors[1:k_actual]
          
          # impute as the average of the k nearest neighbors
          imputed_data[idx, var] <- mean(data[nearest_k, var], na.rm = TRUE)
          
          cat("Imputed", var, "for", data$country_name[idx], 
              "using neighbors:", paste(data$country_name[nearest_k], collapse=", "), "\n")
        } else {
          cat("Warning: Could not impute", var, "for", data$country_name[idx], 
              "- no valid neighbors found\n")
        }
      }
    }
  }
  
  return(imputed_data)
}

# now use the function to impute variables with missing values
# first ensure we have the right variable types
country_df_numeric <- country_level_df

for (var in missing_variables) {
  if (var %in% names(country_level_df) && !is.numeric(country_level_df[[var]])) {
    country_df_numeric[[var]] <- as.numeric(as.character(country_level_df[[var]]))
  }
}

# run the imputation with proper error handling
tryCatch({
  country_df_imputed <- knn_impute(
    data = country_df_numeric,
    vars_to_impute = missing_variables,
    predictor_vars = complete_variables,
    k = 3  # using 3 nearest neighbors
  )
  print("Imputation completed successfully!")
}, error = function(e) {
  # if the knn_impute function still fails, let's try an alternative approach
  print(paste("KNN imputation error:", e$message))
  print("Switching to a simpler mean imputation approach...")
  
  country_df_imputed <- country_df_numeric
  for (var in missing_variables) {
    if (sum(is.na(country_df_imputed[[var]])) > 0) {
      # Calculate mean of non-missing values
      var_mean <- mean(country_df_imputed[[var]], na.rm = TRUE)
      # Replace missing values with mean
      country_df_imputed[[var]][is.na(country_df_imputed[[var]])] <- var_mean
      print(paste("Imputed", var, "with mean value:", var_mean))
    }
  }
  return(country_df_imputed)
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Imputed Happiness_Score for Belgium using neighbors: France, Portugal, Denmark 
## Imputed gender_equality_index for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed z_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed z_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed z_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed z_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed norm_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed norm_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed norm_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed norm_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed z_happiness for Belgium using neighbors: France, Portugal, Denmark 
## Imputed z_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed norm_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## [1] "Imputation completed successfully!"
# save
saveRDS(country_df_imputed, file = "country_df_imputed.rds") 
write.csv(country_df_imputed, "country_df_imputed.csv", row.names = FALSE)

Country aggregates

# libraries
library(dplyr)
library(tidyr)
library(readr)

# load the data
df_rf_new <- read_csv("df_rf_new.csv") # the imputed dataset
## New names:
## Rows: 27438 Columns: 475
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): isocntry, nuts dbl (473): ...1, tnscntry, country, d11, d11r1, d11r2,
## d11r3, gen1, gen2, ge...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
country_df_imputed <- readRDS("country_df_imputed.rds") # external dataset

# custom function to calculate country-level aggregates
calculate_country_aggregates <- function(df) {
  country_agg <- df %>%
    group_by(isocntry) %>%
    summarize(
      
      iso2 = first(isocntry),
      
      # demographics
      mean_age = mean(d11, na.rm = TRUE),
      median_age = median(d11, na.rm = TRUE),
      
      # life satisfaction (recoded)
      mean_life_satisfaction = mean(5 - d70, na.rm = TRUE), 
      pct_satisfied = mean(d70 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political discussions
      mean_natl_political_discuss = mean(d71_1, na.rm = TRUE),
      mean_eu_political_discuss = mean(d71_2, na.rm = TRUE),
      mean_local_political_discuss = mean(d71_3, na.rm = TRUE),
      pct_discuss_natl_politics = mean(d71_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_eu_politics = mean(d71_2 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_local_politics = mean(d71_3 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # trade and globalization (recoded)
      mean_trade_support = mean(5 - qa1, na.rm = TRUE),
      pct_trade_support = mean(qa1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_pro_globalization = mean(qa5a %in% c(1, 2, 3, 5, 7, 8), na.rm = TRUE) * 100,
      pct_anti_globalization = mean(qa5a %in% c(4, 6, 9, 10), na.rm = TRUE) * 100,
      mean_eu_trade_support = mean(5 - qa7, na.rm = TRUE),
      pct_support_eu_trade = mean(qa7 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_esg_support = mean(5 - qa9, na.rm = TRUE),
      pct_support_esg = mean(qa9 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political and social indicators
      mean_left_right = mean(d1, na.rm = TRUE),
      pct_left = mean(d1 %in% c(1, 2, 3, 4), na.rm = TRUE) * 100,
      pct_center = mean(d1 %in% c(5, 6), na.rm = TRUE) * 100,
      pct_right = mean(d1 %in% c(7, 8, 9, 10), na.rm = TRUE) * 100,
      mean_education_years = mean(d8, na.rm = TRUE),
      median_education_years = median(d8, na.rm = TRUE),
      pct_high_education = mean(d8 >= 20, na.rm = TRUE) * 100,
      pct_rural = mean(d25 == 1, na.rm = TRUE) * 100,
      pct_urban = mean(d25 %in% c(2, 3), na.rm = TRUE) * 100,
      pct_large_urban = mean(d25 == 3, na.rm = TRUE) * 100,
      mean_financial_difficulty = mean(4 - d60, na.rm = TRUE),
      pct_financial_difficulty = mean(d60 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_subjective_class = mean(d63, na.rm = TRUE),
      pct_working_class = mean(d63 == 1, na.rm = TRUE) * 100,
      pct_middle_class = mean(d63 %in% c(2, 3, 4), na.rm = TRUE) * 100,
      pct_upper_class = mean(d63 == 5, na.rm = TRUE) * 100,
      mean_voice_in_eu = mean(5 - d72_1, na.rm = TRUE),
      mean_voice_in_country = mean(5 - d72_2, na.rm = TRUE),
      pct_voice_in_eu = mean(d72_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_voice_in_country = mean(d72_2 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # diversity indicators
      pct_friends_diff_ethnic = mean(sd1_1 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_skin = mean(sd1_2 == 1, na.rm = TRUE) * 100,
      pct_friends_roma = mean(sd1_3 == 1, na.rm = TRUE) * 100,
      pct_friends_lgbt = mean(sd1_4 == 1, na.rm = TRUE) * 100,
      pct_friends_disabled = mean(sd1_5 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_religion = mean(sd1_6 == 1, na.rm = TRUE) * 100,
      pct_friends_transgender = mean(sd1_7 == 1, na.rm = TRUE) * 100,
      pct_friends_intersex = mean(sd1_8 == 1, na.rm = TRUE) * 100,
      pct_ethnic_minority = mean(sd2_1 == 1, na.rm = TRUE) * 100,
      pct_skin_minority = mean(sd2_2 == 1, na.rm = TRUE) * 100,
      pct_religious_minority = mean(sd2_3 == 1, na.rm = TRUE) * 100,
      pct_roma = mean(sd2_4 == 1, na.rm = TRUE) * 100,
      pct_sexual_minority = mean(sd2_5 == 1, na.rm = TRUE) * 100,
      pct_disability = mean(sd2_6 == 1, na.rm = TRUE) * 100,
      pct_other_minority = mean(sd2_7 == 1, na.rm = TRUE) * 100,
      pct_any_minority = mean(sd2_1 == 1 | sd2_2 == 1 | sd2_3 == 1 | sd2_4 == 1 | 
                                sd2_5 == 1 | sd2_6 == 1 | sd2_7 == 1, na.rm = TRUE) * 100,
      
      # religion
      pct_catholic = mean(sd3 == 1, na.rm = TRUE) * 100,
      pct_orthodox = mean(sd3 == 2, na.rm = TRUE) * 100,
      pct_protestant = mean(sd3 == 3, na.rm = TRUE) * 100,
      pct_other_christian = mean(sd3 == 4, na.rm = TRUE) * 100,
      pct_jewish = mean(sd3 == 5, na.rm = TRUE) * 100,
      pct_muslim = mean(sd3 %in% c(6, 7, 8), na.rm = TRUE) * 100,
      pct_atheist = mean(sd3 == 13, na.rm = TRUE) * 100,
      pct_nonbeliever = mean(sd3 == 14, na.rm = TRUE) * 100,
      pct_nonreligious = mean(sd3 %in% c(13, 14), na.rm = TRUE) * 100,
      
      # subjective discrimination perception (recoded)
      mean_discrim_ethnic = mean(5 - qc1_1, na.rm = TRUE),
      mean_discrim_skin = mean(5 - qc1_2, na.rm = TRUE),
      mean_discrim_roma = mean(5 - qc1_3, na.rm = TRUE),
      mean_discrim_lgbt = mean(5 - qc1_4, na.rm = TRUE),
      mean_discrim_age = mean(5 - qc1_5, na.rm = TRUE),
      mean_discrim_religion = mean(5 - qc1_6, na.rm = TRUE),
      mean_discrim_disability = mean(5 - qc1_7, na.rm = TRUE),
      mean_discrim_transgender = mean(5 - qc1_8, na.rm = TRUE),
      mean_discrim_gender = mean(5 - qc1_9, na.rm = TRUE),
      mean_discrim_intersex = mean(5 - qc1_10, na.rm = TRUE),
      
    )
  return(country_agg)
}

# apply the function
country_aggregates <- calculate_country_aggregates(df_rf_new)

# join 
country_data_combined <- country_df_imputed %>%
  left_join(country_aggregates, by = "iso2")

# while it's less relevant for a ML model such as random forest, for a multi-level 
# regression model, it's important to standardize the values to allow for better
# coefficient interpretation; the goal is to have both z-scores and 0-1 normalised
# values for each of the original variables (except for variables like 'region')

country_data_combined <- country_data_combined %>%
  # first handle all numeric variables that aren't already standardized
  mutate(
    # z-score standardization
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(z = ~scale(.)[,1]),
           .names = "z_{.col}"),
    
    # also create 0-1 normalized versions
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(norm = ~(.-min(., na.rm=TRUE))/(max(., na.rm=TRUE)-min(., na.rm=TRUE))),
           .names = "norm_{.col}"))

# save the combined dataset
saveRDS(country_data_combined, file = "country_data_combined.rds")
write_csv(country_data_combined, "country_data_combined.csv")

# join with df_rf; the actual, imputed survey data
df_rf_enriched_new <- df_rf_new %>%
  # ISO codes as key
  left_join(country_data_combined, by = c("isocntry" = "iso2"))

# save final dataset
write_csv(df_rf_enriched_new, "df_rf_enriched_new.csv")

Descriptive analysis 1

# load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
country_level_df <- readRDS("country_level_df.rds")

# plot 1: employment and unemployment Rates
p_1 <- ggplot(country_level_df, aes(x = reorder(country_name, Employment))) +
  geom_bar(aes(y = Employment), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = Unemployment), color = "darkred", size = 3) +
  scale_y_continuous(
    name = "Employment Rate (%)",
    limits = c(0, 100),
    sec.axis = sec_axis(~., name = "Unemployment Rate (%)")
  ) +
  labs(title = "Employment and Unemployment Rates by Country (2018)",
       subtitle = "Bars: Employment rate | Points: Unemployment rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 2: GDP per capita and GDP per capita growth rate
p_2 <- ggplot(country_level_df, aes(x = reorder(country_name, gdp_2018))) +
  geom_bar(aes(y = gdp_2018), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = gdp_growth * 500), color = "darkred", size = 3) +  # Scale growth to fit on same axis
  scale_y_continuous(
    name = "GDP per Capita (USD)",
    labels = comma,
    sec.axis = sec_axis(~./500, name = "GDP Growth 2005-2018 (%)")
  ) +
  labs(title = "GDP per Capita (2018) and Growth Rate",
       subtitle = "Bars: GDP per capita | Points: Growth rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 3: relationship between employment and LGBT support
p_3 <- ggplot(country_level_df, aes(x = Employment, y = lgbt_support_percent)) +
  geom_point(aes(color = Unemployment, size = rainbow_score_2019)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "B", name = "Unemployment\nRate (%)") +
  scale_size_continuous(name = "Rainbow\nScore 2019") +
  labs(title = "Relationship between Employment Rates and LGBT Support",
       subtitle = "Color indicates unemployment rate, size shows level of LGBT legal protection",
       x = "Employment Rate (%)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())


# plot 4: relationship between GDP per capita to LGBT support
p_4 <- ggplot(country_level_df, aes(x = gdp_2018, y = lgbt_support_percent)) +
  geom_point(aes(color = rainbow_score_2019, size = gdp_growth)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "E", name = "Rainbow\nScore 2019") +
  scale_size_continuous(name = "GDP Growth\n2005-2018 (%)") +
  scale_x_continuous(labels = comma) +
  labs(title = "Relationship between GDP, LGBT Support and Rights Protections",
       subtitle = "Size indicates GDP growth rate from 2005-2018",
       x = "GDP per Capita (2018, USD)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# display the plots
p_1

p_2

p_3
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

p_4
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

Reduced dataset

library(dplyr)
library(readr)
library(kableExtra)

#df_rf_enriched_new <- readRDS("df_rf_enriched_new.rds")

df_reduced <- df_rf_enriched_new %>%
  select(
    # target
    qc19,
    
    # country identifier
    country, country_name, isocntry,
    
    # individual demographics
    age = d11,           
    gender = d10,            
    education = d8, d8r2,      
    occupation = d15a,           
    urban_rural = d25,            
    financial_insecurity = d60,            
    social_class = d63,         
    religion = sd3,            
    political_ideology = d1, 
    lgb_friends = sd1_4,
    trans_friends = sd1_7,
    
    # Personal identity
    ethnic_minority = sd2_1,
    skin_color_minority = sd2_2,
    religious_minority = sd2_3,
    sexual_lgbt_minority = sd2_5,
    disability_minority = sd2_6,
    other_minority = sd2_7,
    none_minority = sd2_8,
    
    # Discrimination
    trans_discrimination_country = qc1_8,
    trans_discrimination_personal = qc2_6,
    trans_discrimination_workplace = qc4_8,
    trans_discrimination_political = qc6_10,
    country_discrimination_efforts = qc7,
    country_discrimination_efforts_recoded = qc7r,
    trans_workplace_diversity = qc9_10,
    trans_colleague = qc12_11,
    trans_colleague_recoded = qc12_11r,
    trans_child_relationship = qc13_11,
    trans_child_relationship_recoded = qc13_11r,
    lgb_rights = qc15_1, 
    same_sex_relationship = qc15_2, 
    same_sex_marriage = qc15_3,
    lgb_school_materials = qc17_3,
    trans_school_materials = qc17_4,
    intersex_school_materials = qc17_5,
    two_men_public_affection = qc18_2,
    two_men_public_affection_recoded = qc18_2r,
    two_women_public_affection = qc18_3,
    two_women_public_affection_recoded = qc18_3r,
    non_gendered_docs = qc20,
    
    # Country-level variables
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    rainbow_score_2018,      # LGBTI rights/protections (previous year)
    rainbow_score_avg_2019_2018, # Average LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    v2x_libdem,              # Democratic quality
    v2x_egaldem,             # Democratic quality
    Regime_type,             # Democratic quality
    z_functioning_of_government, #Govenment function measure
    composite_equality,     # Equality measure
    z_composite_equality,   # Standardized equality measure
    norm_composite_equality, # Normalized equality measure
    z_lgbt_support,         # Standardized LGBT support
    norm_lgbt_support,      # Normalized LGBT support
    pct_friends_lgbt,       # Percentage of LGBT friends
    z_pct_friends_lgbt,     # Standardized % of LGBT friends
    norm_pct_friends_lgbt,  # Normalized % of LGBT friends
    mean_left_right,         # Average political leaning
    pct_high_education,      # Education level
    region,                  # Geographic region
    
    # Paradata
    interview_date = p1,                      
    interview_start_time = p2,                      
    interview_duration = p3,                      
    interview_duration_recoded = p3r,                     
    people_present_during_interview = p4,                      
    respondent_cooperation = p5                       
  )

# Save to a new file
write_rds(df_reduced, "df_reduced.rds")

Descriptive analysis 2

# packages
library(tidyverse)
library(corrplot)
## corrplot 0.95 loaded
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggrepel)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(DALEX) 
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain
library(ranger) 
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
# load datasets
df_reduced <- readRDS("df_reduced.rds")
country_data_combined <- readRDS("country_data_combined.rds")

# first calculate country aggregates for individual variables so that we have
# continuous data for which we can more easily calculate correlations

df_reduced_aggregated <- df_reduced %>%
  group_by(country_name, isocntry) %>%
  summarize(
    
    # target
    pct_support_trans = mean(qc19 == 1, na.rm = TRUE) * 100,
    pct_oppose_trans = mean(qc19 == 2, na.rm = TRUE) * 100,
    pct_dk_trans = mean(qc19 == 3, na.rm = TRUE) * 100,
    
    # demographics
    mean_age = mean(age, na.rm = TRUE),
    pct_female = mean(gender == 2, na.rm = TRUE) * 100,
    mean_education_years = mean(education, na.rm = TRUE),
    pct_high_educ = mean(d8r2 >= 3, na.rm = TRUE) * 100,
    pct_rural = mean(urban_rural == 1, na.rm = TRUE) * 100,
    pct_urban = mean(urban_rural == 2, na.rm = TRUE) * 100,
    pct_large_urban = mean(urban_rural == 3, na.rm = TRUE) * 100,
    pct_financial_difficulty = mean(financial_insecurity <= 2, na.rm = TRUE) * 100,
    pct_working_class = mean(social_class == 1, na.rm = TRUE) * 100,
    pct_middle_class = mean(social_class %in% c(2,3,4), na.rm = TRUE) * 100,
    pct_upper_class = mean(social_class == 5, na.rm = TRUE) * 100,
    
    # values and identity
    mean_political_ideology = mean(political_ideology, na.rm = TRUE),
    pct_left = mean(political_ideology <= 3, na.rm = TRUE) * 100,
    pct_center = mean(political_ideology > 3 & political_ideology < 8, na.rm = TRUE) * 100,
    pct_right = mean(political_ideology >= 8, na.rm = TRUE) * 100,
    
    # religious composition
    pct_religious = mean(religion %in% c(1:11), na.rm = TRUE) * 100,
    pct_catholic = mean(religion == 1, na.rm = TRUE) * 100,
    pct_orthodox = mean(religion == 2, na.rm = TRUE) * 100,
    pct_protestant = mean(religion == 3, na.rm = TRUE) * 100,
    pct_other_christian = mean(religion == 4, na.rm = TRUE) * 100,
    pct_muslim = mean(religion %in% c(6:8), na.rm = TRUE) * 100,
    pct_other_religion = mean(religion %in% c(9:11), na.rm = TRUE) * 100,
    pct_atheist = mean(religion == 13, na.rm = TRUE) * 100,
    pct_nonbeliever = mean(religion == 14, na.rm = TRUE) * 100,
    pct_nonreligious = mean(religion %in% c(13,14), na.rm = TRUE) * 100,
    
    # social networks and contact
    pct_lgb_friends = mean(lgb_friends == 1, na.rm = TRUE) * 100,
    pct_trans_friends = mean(trans_friends == 1, na.rm = TRUE) * 100,
    
    # personal identity/minority status
    pct_ethnic_minority = mean(ethnic_minority == 1, na.rm = TRUE) * 100,
    pct_skin_color_minority = mean(skin_color_minority == 1, na.rm = TRUE) * 100,
    pct_religious_minority = mean(religious_minority == 1, na.rm = TRUE) * 100,
    pct_sexual_minority = mean(sexual_lgbt_minority == 1, na.rm = TRUE) * 100,
    pct_disability_minority = mean(disability_minority == 1, na.rm = TRUE) * 100,
    pct_other_minority = mean(other_minority == 1, na.rm = TRUE) * 100,
    pct_no_minority = mean(none_minority == 1, na.rm = TRUE) * 100,
    
    # discrimination perceptions; we need to be careful with interpretation/ coding
    mean_trans_discrim_country = mean(trans_discrimination_country, na.rm = TRUE),
    pct_perceive_trans_discrim = mean(trans_discrimination_country %in% c(1,2), na.rm = TRUE) * 100,
    pct_experienced_trans_discrim = mean(trans_discrimination_personal == 1, na.rm = TRUE) * 100,
    pct_workplace_trans_discrim = mean(trans_discrimination_workplace == 1, na.rm = TRUE) * 100,
    
    # political representation comfort - original scale (1-10, higher = more comfortable)
    mean_trans_political_comfort = mean(trans_discrimination_political, na.rm = TRUE),
    pct_comfortable_trans_political = mean(trans_discrimination_political >= 7, na.rm = TRUE) * 100,
    
    # effectiveness of discrimination efforts (original scale: 1-10, higher = more effective)
    mean_country_discrimination_efforts = mean(country_discrimination_efforts, na.rm = TRUE),
    
    # workplace diversity (1=Yes definitely, 4=No definitely not)
    mean_trans_workplace_diversity = mean(trans_workplace_diversity, na.rm = TRUE),
    pct_support_trans_workplace_diversity = mean(trans_workplace_diversity %in% c(1,2), na.rm = TRUE) * 100,
    
    # LGBT attitudes - REVERSED for these scales where 1=Totally agree, 4=Totally disagree
    # Create reversed versions so higher = more supportive
    mean_lgb_rights_rev = mean(case_when(
      lgb_rights == 1 ~ 4,
      lgb_rights == 2 ~ 3,
      lgb_rights == 3 ~ 2,
      lgb_rights == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_relationship_rev = mean(case_when(
      same_sex_relationship == 1 ~ 4,
      same_sex_relationship == 2 ~ 3,
      same_sex_relationship == 3 ~ 2,
      same_sex_relationship == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_marriage_rev = mean(case_when(
      same_sex_marriage == 1 ~ 4,
      same_sex_marriage == 2 ~ 3,
      same_sex_marriage == 3 ~ 2,
      same_sex_marriage == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with LGBT rights (original values 1 or 2)
    pct_support_lgb_rights = mean(lgb_rights %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_relationship = mean(same_sex_relationship %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_marriage = mean(same_sex_marriage %in% c(1,2), na.rm = TRUE) * 100,
    
    # school materials (1=Totally agree, 4=Totally disagree)
    # Create reversed versions so higher = more supportive
    mean_lgb_school_materials_rev = mean(case_when(
      lgb_school_materials == 1 ~ 4,
      lgb_school_materials == 2 ~ 3,
      lgb_school_materials == 3 ~ 2,
      lgb_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_trans_school_materials_rev = mean(case_when(
      trans_school_materials == 1 ~ 4,
      trans_school_materials == 2 ~ 3,
      trans_school_materials == 3 ~ 2,
      trans_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_intersex_school_materials_rev = mean(case_when(
      intersex_school_materials == 1 ~ 4,
      intersex_school_materials == 2 ~ 3,
      intersex_school_materials == 3 ~ 2,
      intersex_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with inclusive school materials
    pct_support_lgb_school_materials = mean(lgb_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_trans_school_materials = mean(trans_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_intersex_school_materials = mean(intersex_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    
    # comfort with same-sex public displays of affection (1-10)
    mean_men_pda_comfort = mean(two_men_public_affection, na.rm = TRUE),
    mean_women_pda_comfort = mean(two_women_public_affection, na.rm = TRUE),
    pct_comfortable_men_pda = mean(two_men_public_affection >= 7, na.rm = TRUE) * 100,
    pct_comfortable_women_pda = mean(two_women_public_affection >= 7, na.rm = TRUE) * 100,
    
    # comfort with transgender colleagues/children (1-10)
    mean_trans_colleague_comfort = mean(trans_colleague, na.rm = TRUE),
    mean_trans_child_relationship = mean(trans_child_relationship, na.rm = TRUE),
    pct_comfortable_trans_colleague = mean(trans_colleague >= 7, na.rm = TRUE) * 100,
    pct_comfortable_trans_child = mean(trans_child_relationship >= 7, na.rm = TRUE) * 100,
    
    # support for non-gendered documents
    pct_support_nongendered_docs = mean(non_gendered_docs == 1, na.rm = TRUE) * 100,
    pct_oppose_nongendered_docs = mean(non_gendered_docs == 2, na.rm = TRUE) * 100,
    pct_dk_nongendered_docs = mean(non_gendered_docs == 3, na.rm = TRUE) * 100) %>%
  ungroup()
## `summarise()` has grouped output by 'country_name'. You can override using the
## `.groups` argument.
# add country-level variables from the original dataset
# extract country-level variables that don't need to be aggregated
country_level_variables <- df_reduced %>%
  select(country_name, gdp_2018, rainbow_score_2019, rainbow_score_2018, 
         gender_equality_index, Happiness_Score, 
         v2x_libdem, v2x_egaldem, Regime_type, composite_equality, 
         z_composite_equality, norm_composite_equality, z_lgbt_support, 
         norm_lgbt_support, pct_friends_lgbt, z_pct_friends_lgbt, 
         norm_pct_friends_lgbt, mean_left_right, pct_high_education, region) %>%
  distinct()

# merge with aggregated data
df_reduced_aggregated_complete <- df_reduced_aggregated %>%
  left_join(country_level_variables, by = "country_name")

# what we could add
setdiff(colnames(country_data_combined), country_level_variables)
##   [1] "country_name"                        
##   [2] "iso2"                                
##   [3] "v2x_libdem"                          
##   [4] "v2x_polyarchy"                       
##   [5] "v2x_gender"                          
##   [6] "v2x_egaldem"                         
##   [7] "v2x_liberal"                         
##   [8] "v2xcs_ccsi"                          
##   [9] "v2x_freexp"                          
##  [10] "Overall_score"                       
##  [11] "Electoral_process_and_pluralism"     
##  [12] "Functioning_of_government"           
##  [13] "Political_participation"             
##  [14] "Political_culture"                   
##  [15] "Civil_liberties"                     
##  [16] "Regime_type"                         
##  [17] "gdp_2005"                            
##  [18] "gdp_2018"                            
##  [19] "gdp_growth"                          
##  [20] "rainbow_score_2019"                  
##  [21] "rainbow_score_2018"                  
##  [22] "rainbow_score_avg_2019_2018"         
##  [23] "rainbow_score_avg_2013_2014"         
##  [24] "rainbow_score_difference"            
##  [25] "Happiness_Score"                     
##  [26] "Unemployment"                        
##  [27] "Employment"                          
##  [28] "gender_equality_index"               
##  [29] "z_v2x_libdem"                        
##  [30] "z_v2x_polyarchy"                     
##  [31] "z_v2x_gender"                        
##  [32] "z_v2x_egaldem"                       
##  [33] "z_v2x_liberal"                       
##  [34] "z_v2xcs_ccsi"                        
##  [35] "z_v2x_freexp"                        
##  [36] "z_gdp_2018"                          
##  [37] "log_gdp_2018"                        
##  [38] "z_gdp_growth"                        
##  [39] "z_unemployment"                      
##  [40] "z_rainbow_score"                     
##  [41] "norm_rainbow_score"                  
##  [42] "z_lgbt_support"                      
##  [43] "norm_lgbt_support"                   
##  [44] "z_happiness"                         
##  [45] "z_gender_equality"                   
##  [46] "norm_gender_equality"                
##  [47] "z_functioning_of_government"         
##  [48] "composite_equality"                  
##  [49] "composite_democracy"                 
##  [50] "region"                              
##  [51] "isocntry"                            
##  [52] "mean_age"                            
##  [53] "median_age"                          
##  [54] "mean_life_satisfaction"              
##  [55] "pct_satisfied"                       
##  [56] "mean_natl_political_discuss"         
##  [57] "mean_eu_political_discuss"           
##  [58] "mean_local_political_discuss"        
##  [59] "pct_discuss_natl_politics"           
##  [60] "pct_discuss_eu_politics"             
##  [61] "pct_discuss_local_politics"          
##  [62] "mean_trade_support"                  
##  [63] "pct_trade_support"                   
##  [64] "pct_pro_globalization"               
##  [65] "pct_anti_globalization"              
##  [66] "mean_eu_trade_support"               
##  [67] "pct_support_eu_trade"                
##  [68] "mean_esg_support"                    
##  [69] "pct_support_esg"                     
##  [70] "mean_left_right"                     
##  [71] "pct_left"                            
##  [72] "pct_center"                          
##  [73] "pct_right"                           
##  [74] "mean_education_years"                
##  [75] "median_education_years"              
##  [76] "pct_high_education"                  
##  [77] "pct_rural"                           
##  [78] "pct_urban"                           
##  [79] "pct_large_urban"                     
##  [80] "mean_financial_difficulty"           
##  [81] "pct_financial_difficulty"            
##  [82] "mean_subjective_class"               
##  [83] "pct_working_class"                   
##  [84] "pct_middle_class"                    
##  [85] "pct_upper_class"                     
##  [86] "mean_voice_in_eu"                    
##  [87] "mean_voice_in_country"               
##  [88] "pct_voice_in_eu"                     
##  [89] "pct_voice_in_country"                
##  [90] "pct_friends_diff_ethnic"             
##  [91] "pct_friends_diff_skin"               
##  [92] "pct_friends_roma"                    
##  [93] "pct_friends_lgbt"                    
##  [94] "pct_friends_disabled"                
##  [95] "pct_friends_diff_religion"           
##  [96] "pct_friends_transgender"             
##  [97] "pct_friends_intersex"                
##  [98] "pct_ethnic_minority"                 
##  [99] "pct_skin_minority"                   
## [100] "pct_religious_minority"              
## [101] "pct_roma"                            
## [102] "pct_sexual_minority"                 
## [103] "pct_disability"                      
## [104] "pct_other_minority"                  
## [105] "pct_any_minority"                    
## [106] "pct_catholic"                        
## [107] "pct_orthodox"                        
## [108] "pct_protestant"                      
## [109] "pct_other_christian"                 
## [110] "pct_jewish"                          
## [111] "pct_muslim"                          
## [112] "pct_atheist"                         
## [113] "pct_nonbeliever"                     
## [114] "pct_nonreligious"                    
## [115] "mean_discrim_ethnic"                 
## [116] "mean_discrim_skin"                   
## [117] "mean_discrim_roma"                   
## [118] "mean_discrim_lgbt"                   
## [119] "mean_discrim_age"                    
## [120] "mean_discrim_religion"               
## [121] "mean_discrim_disability"             
## [122] "mean_discrim_transgender"            
## [123] "mean_discrim_gender"                 
## [124] "mean_discrim_intersex"               
## [125] "z_Overall_score"                     
## [126] "z_Electoral_process_and_pluralism"   
## [127] "z_Functioning_of_government"         
## [128] "z_Political_participation"           
## [129] "z_Political_culture"                 
## [130] "z_Civil_liberties"                   
## [131] "z_gdp_2005"                          
## [132] "z_rainbow_score_2019"                
## [133] "z_rainbow_score_2018"                
## [134] "z_rainbow_score_avg_2019_2018"       
## [135] "z_rainbow_score_avg_2013_2014"       
## [136] "z_rainbow_score_difference"          
## [137] "z_Happiness_Score"                   
## [138] "z_Unemployment"                      
## [139] "z_Employment"                        
## [140] "z_gender_equality_index"             
## [141] "z_log_gdp_2018"                      
## [142] "z_composite_equality"                
## [143] "z_composite_democracy"               
## [144] "z_mean_age"                          
## [145] "z_median_age"                        
## [146] "z_mean_life_satisfaction"            
## [147] "z_pct_satisfied"                     
## [148] "z_mean_natl_political_discuss"       
## [149] "z_mean_eu_political_discuss"         
## [150] "z_mean_local_political_discuss"      
## [151] "z_pct_discuss_natl_politics"         
## [152] "z_pct_discuss_eu_politics"           
## [153] "z_pct_discuss_local_politics"        
## [154] "z_mean_trade_support"                
## [155] "z_pct_trade_support"                 
## [156] "z_pct_pro_globalization"             
## [157] "z_pct_anti_globalization"            
## [158] "z_mean_eu_trade_support"             
## [159] "z_pct_support_eu_trade"              
## [160] "z_mean_esg_support"                  
## [161] "z_pct_support_esg"                   
## [162] "z_mean_left_right"                   
## [163] "z_pct_left"                          
## [164] "z_pct_center"                        
## [165] "z_pct_right"                         
## [166] "z_mean_education_years"              
## [167] "z_median_education_years"            
## [168] "z_pct_high_education"                
## [169] "z_pct_rural"                         
## [170] "z_pct_urban"                         
## [171] "z_pct_large_urban"                   
## [172] "z_mean_financial_difficulty"         
## [173] "z_pct_financial_difficulty"          
## [174] "z_mean_subjective_class"             
## [175] "z_pct_working_class"                 
## [176] "z_pct_middle_class"                  
## [177] "z_pct_upper_class"                   
## [178] "z_mean_voice_in_eu"                  
## [179] "z_pct_voice_in_eu"                   
## [180] "z_pct_friends_diff_ethnic"           
## [181] "z_pct_friends_diff_skin"             
## [182] "z_pct_friends_roma"                  
## [183] "z_pct_friends_lgbt"                  
## [184] "z_pct_friends_disabled"              
## [185] "z_pct_friends_diff_religion"         
## [186] "z_pct_friends_transgender"           
## [187] "z_pct_friends_intersex"              
## [188] "z_pct_ethnic_minority"               
## [189] "z_pct_skin_minority"                 
## [190] "z_pct_religious_minority"            
## [191] "z_pct_roma"                          
## [192] "z_pct_sexual_minority"               
## [193] "z_pct_disability"                    
## [194] "z_pct_other_minority"                
## [195] "z_pct_any_minority"                  
## [196] "z_pct_catholic"                      
## [197] "z_pct_orthodox"                      
## [198] "z_pct_protestant"                    
## [199] "z_pct_other_christian"               
## [200] "z_pct_jewish"                        
## [201] "z_pct_muslim"                        
## [202] "z_pct_atheist"                       
## [203] "z_pct_nonbeliever"                   
## [204] "z_pct_nonreligious"                  
## [205] "z_mean_discrim_ethnic"               
## [206] "z_mean_discrim_skin"                 
## [207] "z_mean_discrim_roma"                 
## [208] "z_mean_discrim_lgbt"                 
## [209] "z_mean_discrim_age"                  
## [210] "z_mean_discrim_religion"             
## [211] "z_mean_discrim_disability"           
## [212] "z_mean_discrim_transgender"          
## [213] "z_mean_discrim_gender"               
## [214] "z_mean_discrim_intersex"             
## [215] "norm_v2x_libdem"                     
## [216] "norm_v2x_polyarchy"                  
## [217] "norm_v2x_gender"                     
## [218] "norm_v2x_egaldem"                    
## [219] "norm_v2x_liberal"                    
## [220] "norm_v2xcs_ccsi"                     
## [221] "norm_v2x_freexp"                     
## [222] "norm_Overall_score"                  
## [223] "norm_Electoral_process_and_pluralism"
## [224] "norm_Functioning_of_government"      
## [225] "norm_Political_participation"        
## [226] "norm_Political_culture"              
## [227] "norm_Civil_liberties"                
## [228] "norm_gdp_2005"                       
## [229] "norm_gdp_2018"                       
## [230] "norm_gdp_growth"                     
## [231] "norm_rainbow_score_2019"             
## [232] "norm_rainbow_score_2018"             
## [233] "norm_rainbow_score_avg_2019_2018"    
## [234] "norm_rainbow_score_avg_2013_2014"    
## [235] "norm_rainbow_score_difference"       
## [236] "norm_Happiness_Score"                
## [237] "norm_Unemployment"                   
## [238] "norm_Employment"                     
## [239] "norm_gender_equality_index"          
## [240] "norm_log_gdp_2018"                   
## [241] "norm_composite_equality"             
## [242] "norm_composite_democracy"            
## [243] "norm_mean_age"                       
## [244] "norm_median_age"                     
## [245] "norm_mean_life_satisfaction"         
## [246] "norm_pct_satisfied"                  
## [247] "norm_mean_natl_political_discuss"    
## [248] "norm_mean_eu_political_discuss"      
## [249] "norm_mean_local_political_discuss"   
## [250] "norm_pct_discuss_natl_politics"      
## [251] "norm_pct_discuss_eu_politics"        
## [252] "norm_pct_discuss_local_politics"     
## [253] "norm_mean_trade_support"             
## [254] "norm_pct_trade_support"              
## [255] "norm_pct_pro_globalization"          
## [256] "norm_pct_anti_globalization"         
## [257] "norm_mean_eu_trade_support"          
## [258] "norm_pct_support_eu_trade"           
## [259] "norm_mean_esg_support"               
## [260] "norm_pct_support_esg"                
## [261] "norm_mean_left_right"                
## [262] "norm_pct_left"                       
## [263] "norm_pct_center"                     
## [264] "norm_pct_right"                      
## [265] "norm_mean_education_years"           
## [266] "norm_median_education_years"         
## [267] "norm_pct_high_education"             
## [268] "norm_pct_rural"                      
## [269] "norm_pct_urban"                      
## [270] "norm_pct_large_urban"                
## [271] "norm_mean_financial_difficulty"      
## [272] "norm_pct_financial_difficulty"       
## [273] "norm_mean_subjective_class"          
## [274] "norm_pct_working_class"              
## [275] "norm_pct_middle_class"               
## [276] "norm_pct_upper_class"                
## [277] "norm_mean_voice_in_eu"               
## [278] "norm_pct_voice_in_eu"                
## [279] "norm_pct_friends_diff_ethnic"        
## [280] "norm_pct_friends_diff_skin"          
## [281] "norm_pct_friends_roma"               
## [282] "norm_pct_friends_lgbt"               
## [283] "norm_pct_friends_disabled"           
## [284] "norm_pct_friends_diff_religion"      
## [285] "norm_pct_friends_transgender"        
## [286] "norm_pct_friends_intersex"           
## [287] "norm_pct_ethnic_minority"            
## [288] "norm_pct_skin_minority"              
## [289] "norm_pct_religious_minority"         
## [290] "norm_pct_roma"                       
## [291] "norm_pct_sexual_minority"            
## [292] "norm_pct_disability"                 
## [293] "norm_pct_other_minority"             
## [294] "norm_pct_any_minority"               
## [295] "norm_pct_catholic"                   
## [296] "norm_pct_orthodox"                   
## [297] "norm_pct_protestant"                 
## [298] "norm_pct_other_christian"            
## [299] "norm_pct_jewish"                     
## [300] "norm_pct_muslim"                     
## [301] "norm_pct_atheist"                    
## [302] "norm_pct_nonbeliever"                
## [303] "norm_pct_nonreligious"               
## [304] "norm_mean_discrim_ethnic"            
## [305] "norm_mean_discrim_skin"              
## [306] "norm_mean_discrim_roma"              
## [307] "norm_mean_discrim_lgbt"              
## [308] "norm_mean_discrim_age"               
## [309] "norm_mean_discrim_religion"          
## [310] "norm_mean_discrim_disability"        
## [311] "norm_mean_discrim_transgender"       
## [312] "norm_mean_discrim_gender"            
## [313] "norm_mean_discrim_intersex"
country_data_combined_add <- country_data_combined %>%
  select(iso2, norm_gdp_growth, norm_Unemployment, pct_trade_support, norm_gdp_2018,
         z_gender_equality, z_rainbow_score, z_v2x_libdem, z_v2x_egaldem, z_happiness,
         norm_gdp_growth, z_v2x_gender)

df_reduced_aggregated_complete <- df_reduced_aggregated_complete %>%
  left_join(country_data_combined_add, by = c("isocntry" = "iso2"))

# Check the final dataset
dim(df_reduced_aggregated_complete)
## [1]  28 100
# save it
write_rds(df_reduced_aggregated_complete, "df_reduced_aggregated_complete")


### create different plots
# calculate correlation matrix for numeric variables
cor_matrix <- cor(select_if(df_reduced_aggregated_complete, is.numeric), 
                  use = "pairwise.complete.obs")
## Warning in cor(select_if(df_reduced_aggregated_complete, is.numeric), use =
## "pairwise.complete.obs"): the standard deviation is zero
# (1) visualize complete correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", 
         diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.7, tl.cex = 0.7)

# (2) create custom function to analyze correlations with outcome
analyze_correlations_with_outcome <- function(data, outcome_var, min_correlation = 0.3) {
  # numeric columns only
  numeric_data <- data %>% select(where(is.numeric))
  
  # calculate correlations with outcome
  cors <- cor(numeric_data, numeric_data[[outcome_var]], use = "pairwise.complete.obs")
  
  # convert to data frame
  cor_df <- data.frame(
    variable = rownames(cors),
    correlation = cors[,1]) %>%
    # remove the outcome variable correlating with itself
    filter(variable != outcome_var) %>%
    # sort by absolute correlation
    arrange(desc(abs(correlation)))
  
  # filter by minimum correlation threshold
  cor_df <- cor_df %>% filter(abs(correlation) >= min_correlation)
  
  return(cor_df)
}

# run correlation analysis for transgender support
trans_support_correlations <- analyze_correlations_with_outcome(
  df_reduced_aggregated_complete, "pct_support_trans", min_correlation = 0.25)
## Warning in cor(numeric_data, numeric_data[[outcome_var]], use =
## "pairwise.complete.obs"): the standard deviation is zero
# display results
print(trans_support_correlations)
##                                                                    variable
## pct_oppose_trans                                           pct_oppose_trans
## pct_support_nongendered_docs                   pct_support_nongendered_docs
## pct_oppose_nongendered_docs                     pct_oppose_nongendered_docs
## pct_support_lgb_school_materials           pct_support_lgb_school_materials
## z_lgbt_support                                               z_lgbt_support
## norm_lgbt_support                                         norm_lgbt_support
## pct_support_lgb_rights                               pct_support_lgb_rights
## mean_lgb_rights_rev                                     mean_lgb_rights_rev
## pct_support_intersex_school_materials pct_support_intersex_school_materials
## pct_support_trans_school_materials       pct_support_trans_school_materials
## pct_support_same_sex_marriage                 pct_support_same_sex_marriage
## pct_support_same_sex_relationship         pct_support_same_sex_relationship
## pct_comfortable_trans_colleague             pct_comfortable_trans_colleague
## mean_trans_colleague_comfort                   mean_trans_colleague_comfort
## mean_lgb_school_materials_rev                 mean_lgb_school_materials_rev
## mean_same_sex_marriage_rev                       mean_same_sex_marriage_rev
## mean_same_sex_relationship_rev               mean_same_sex_relationship_rev
## mean_intersex_school_materials_rev       mean_intersex_school_materials_rev
## mean_trans_school_materials_rev             mean_trans_school_materials_rev
## mean_men_pda_comfort                                   mean_men_pda_comfort
## norm_pct_friends_lgbt                                 norm_pct_friends_lgbt
## pct_lgb_friends                                             pct_lgb_friends
## pct_friends_lgbt                                           pct_friends_lgbt
## z_pct_friends_lgbt                                       z_pct_friends_lgbt
## pct_comfortable_men_pda                             pct_comfortable_men_pda
## mean_women_pda_comfort                               mean_women_pda_comfort
## pct_comfortable_trans_political             pct_comfortable_trans_political
## pct_comfortable_women_pda                         pct_comfortable_women_pda
## mean_trans_political_comfort                   mean_trans_political_comfort
## mean_trans_child_relationship                 mean_trans_child_relationship
## pct_comfortable_trans_child                     pct_comfortable_trans_child
## z_v2x_egaldem                                                 z_v2x_egaldem
## v2x_egaldem                                                     v2x_egaldem
## composite_equality                                       composite_equality
## z_composite_equality                                   z_composite_equality
## norm_composite_equality                             norm_composite_equality
## rainbow_score_2018                                       rainbow_score_2018
## z_v2x_libdem                                                   z_v2x_libdem
## v2x_libdem                                                       v2x_libdem
## rainbow_score_2019                                       rainbow_score_2019
## z_rainbow_score                                             z_rainbow_score
## z_gender_equality                                         z_gender_equality
## gender_equality_index                                 gender_equality_index
## pct_right                                                         pct_right
## pct_trans_friends                                         pct_trans_friends
## mean_political_ideology                             mean_political_ideology
## mean_left_right                                             mean_left_right
## Happiness_Score                                             Happiness_Score
## z_happiness                                                     z_happiness
## norm_gdp_2018                                                 norm_gdp_2018
## gdp_2018                                                           gdp_2018
## pct_center                                                       pct_center
## norm_gdp_growth                                             norm_gdp_growth
## mean_trans_discrim_country                       mean_trans_discrim_country
## pct_perceive_trans_discrim                       pct_perceive_trans_discrim
## pct_high_education                                       pct_high_education
## pct_high_educ                                                 pct_high_educ
## pct_workplace_trans_discrim                     pct_workplace_trans_discrim
## pct_financial_difficulty                           pct_financial_difficulty
## pct_female                                                       pct_female
## pct_left                                                           pct_left
## mean_age                                                           mean_age
## pct_atheist                                                     pct_atheist
## pct_trade_support                                         pct_trade_support
## z_v2x_gender                                                   z_v2x_gender
## pct_orthodox                                                   pct_orthodox
## pct_religious                                                 pct_religious
## pct_nonreligious                                           pct_nonreligious
## pct_experienced_trans_discrim                 pct_experienced_trans_discrim
## pct_urban                                                         pct_urban
## pct_support_trans_workplace_diversity pct_support_trans_workplace_diversity
## pct_no_minority                                             pct_no_minority
## pct_large_urban                                             pct_large_urban
## pct_protestant                                               pct_protestant
## pct_other_religion                                       pct_other_religion
## pct_ethnic_minority                                     pct_ethnic_minority
## mean_country_discrimination_efforts     mean_country_discrimination_efforts
##                                       correlation
## pct_oppose_trans                       -1.0000000
## pct_support_nongendered_docs            0.9552632
## pct_oppose_nongendered_docs            -0.9552632
## pct_support_lgb_school_materials        0.9099915
## z_lgbt_support                          0.9055394
## norm_lgbt_support                       0.9055394
## pct_support_lgb_rights                  0.9054333
## mean_lgb_rights_rev                     0.8873981
## pct_support_intersex_school_materials   0.8853638
## pct_support_trans_school_materials      0.8828781
## pct_support_same_sex_marriage           0.8820719
## pct_support_same_sex_relationship       0.8799193
## pct_comfortable_trans_colleague         0.8795309
## mean_trans_colleague_comfort            0.8697944
## mean_lgb_school_materials_rev           0.8679107
## mean_same_sex_marriage_rev              0.8669420
## mean_same_sex_relationship_rev          0.8656658
## mean_intersex_school_materials_rev      0.8479568
## mean_trans_school_materials_rev         0.8459165
## mean_men_pda_comfort                    0.8398369
## norm_pct_friends_lgbt                   0.8350491
## pct_lgb_friends                         0.8350491
## pct_friends_lgbt                        0.8350491
## z_pct_friends_lgbt                      0.8350491
## pct_comfortable_men_pda                 0.8313883
## mean_women_pda_comfort                  0.8219693
## pct_comfortable_trans_political         0.8144761
## pct_comfortable_women_pda               0.8135901
## mean_trans_political_comfort            0.8080428
## mean_trans_child_relationship           0.8008226
## pct_comfortable_trans_child             0.7938314
## z_v2x_egaldem                           0.7909474
## v2x_egaldem                             0.7909474
## composite_equality                      0.7752634
## z_composite_equality                    0.7752634
## norm_composite_equality                 0.7752634
## rainbow_score_2018                      0.7539898
## z_v2x_libdem                            0.7147897
## v2x_libdem                              0.7147897
## rainbow_score_2019                      0.7094428
## z_rainbow_score                         0.7094428
## z_gender_equality                       0.7035475
## gender_equality_index                   0.7035475
## pct_right                              -0.6890892
## pct_trans_friends                       0.6648839
## mean_political_ideology                -0.6422553
## mean_left_right                        -0.6422553
## Happiness_Score                         0.6310032
## z_happiness                             0.6310032
## norm_gdp_2018                           0.5771817
## gdp_2018                                0.5771817
## pct_center                              0.5523491
## norm_gdp_growth                        -0.5522688
## mean_trans_discrim_country             -0.5172765
## pct_perceive_trans_discrim              0.5008079
## pct_high_education                      0.4980007
## pct_high_educ                           0.4948097
## pct_workplace_trans_discrim             0.4664769
## pct_financial_difficulty               -0.4504279
## pct_female                             -0.4496147
## pct_left                                0.4253939
## mean_age                                0.4017389
## pct_atheist                             0.4001147
## pct_trade_support                       0.3991507
## z_v2x_gender                            0.3977959
## pct_orthodox                           -0.3976753
## pct_religious                          -0.3825894
## pct_nonreligious                        0.3724599
## pct_experienced_trans_discrim          -0.3707182
## pct_urban                               0.3530711
## pct_support_trans_workplace_diversity   0.3494914
## pct_no_minority                         0.3146637
## pct_large_urban                        -0.3090152
## pct_protestant                          0.3082064
## pct_other_religion                      0.2785021
## pct_ethnic_minority                    -0.2752098
## mean_country_discrimination_efforts     0.2604523
# create a visualization of top correlations
ggplot(trans_support_correlations %>% head(15), 
       aes(x = reorder(variable, correlation), y = correlation)) +
  geom_bar(stat = "identity", aes(fill = correlation > 0)) +
  coord_flip() +
  labs(title = "Top correlated variables of transgender rights support",
       subtitle = "Country-level aggregated data",
       x = "Variables",
       y = "Correlation with % supporting transgender document changes") +
  theme_minimal() +
  scale_fill_manual(values = c("firebrick", "steelblue"), 
                    name = "Direction",
                    labels = c("Negative", "Positive"))

# (3) create correlation matrix for key variables
key_vars <- c("pct_support_trans", 
              trans_support_correlations$variable[1:25], # top 25 correlates
              "gdp_2018", "rainbow_score_2019", "gender_equality_index")

key_cor_matrix <- cor(df_reduced_aggregated_complete[, key_vars], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix, method = "color",
         order = "hclust", diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

# (4) create correlation matrix for all country variables
key_vars_country <- c("pct_support_trans", "norm_gdp_2018", "rainbow_score_2019",
              "z_gender_equality", "norm_Unemployment", "norm_gdp_growth", 
              "z_v2x_libdem", "z_v2x_egaldem","z_happiness", "z_v2x_gender")

key_cor_matrix_country <- cor(df_reduced_aggregated_complete[, key_vars_country], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix_country, method = "color",
         order = "hclust", diag = F, addCoef.col = "black",
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

### 
# regional comparisons
region_summary <- df_reduced_aggregated_complete %>%
  group_by(region) %>%
  summarize(
    n_countries = n(),
    avg_trans_support = mean(pct_support_trans),
    min_support = min(pct_support_trans),
    max_support = max(pct_support_trans),
    std_dev = sd(pct_support_trans)) %>%
  arrange(desc(avg_trans_support))

# visualize regional differences
ggplot(df_reduced_aggregated_complete, aes(x = reorder(region, pct_support_trans, FUN = mean), 
                                           y = pct_support_trans)) +
  geom_boxplot(aes(fill = region)) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by European Region",
       x = "Region",
       y = "% Supporting Transgender Rights") +
  theme_minimal() +
  theme(legend.position = "none")

### some regression
# create a scatterplot matrix with regression lines
ggplot(df_reduced_aggregated_complete, 
       aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_reduced_aggregated_complete, 
       aes(x = gender_equality_index, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Gender Equality and Transgender Rights Support",
       x = "Gender Equality Index",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### clusters
cluster_vars <- c("pct_support_trans", "norm_gdp_2018", 
                  "rainbow_score_2019", "gender_equality_index", 
                  "v2x_libdem", "pct_catholic", "pct_nonreligious")

# Prepare data
cluster_data <- df_reduced_aggregated_complete %>%
  select(country_name, all_of(cluster_vars)) %>%
  na.omit() %>%
  column_to_rownames("country_name")

# Scale the data
cluster_data_scaled <- scale(cluster_data)

# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled, method = "euclidean")

# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc, main = "Clustering of Countries by Support for Transgender Rights",
     sub = "", xlab = "", cex = 0.7)
rect.hclust(hc, k = 4, border = "red")

### summary table
country_summary <- df_reduced_aggregated_complete %>%
  select(country_name, pct_support_trans, rainbow_score_2019, 
         gender_equality_index, norm_gdp_2018,
         pct_nonreligious, v2x_libdem, gdp_2018) %>%
  arrange(desc(pct_support_trans))

kable(country_summary, 
      col.names = c("Country", "Trans Rights Support (%)", "Rainbow Score", 
                    "Gender Equality", "LGBT Comfort", 
                    "Non-religious (%)", "Liberal Democracy", "GDP 2018"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE) %>%
  add_header_above(c(" " = 1, "Support Measures" = 2, "Equality Measures" = 2, 
                     "Social Factors" = 2, "Economic" = 1))
Support Measures
Equality Measures
Social Factors
Economic
Country Trans Rights Support (%) Rainbow Score Gender Equality LGBT Comfort Non-religious (%) Liberal Democracy GDP 2018
Spain 89.1 58.5 68.3 0.2 21.7 0.8 41073.7
Netherlands 86.8 49.7 72.9 0.4 47.6 0.8 58818.0
Malta 82.2 74.7 60.1 0.3 4.8 0.6 48127.7
Denmark 79.5 60.3 76.8 0.4 14.2 0.9 57231.3
France 79.0 62.7 72.6 0.2 22.0 0.8 46397.5
Luxembourg 79.0 41.3 69.0 1.0 20.8 0.8 116334.0
Germany 78.9 47.5 65.5 0.3 28.1 0.8 56272.9
Portugal 75.4 57.5 56.0 0.1 9.7 0.8 34725.2
Ireland 74.1 44.7 69.5 0.7 10.0 0.8 86433.7
Sweden 73.1 52.9 82.6 0.3 37.2 0.9 53122.5
Belgium 73.0 70.4 70.5 0.3 24.9 0.8 52466.5
Finland 71.9 62.2 73.0 0.3 17.0 0.8 49242.7
United Kingdom 70.8 67.6 69.6 0.2 29.4 0.8 47108.0
Austria 61.8 48.5 63.3 0.4 21.5 0.8 56654.5
Greece 59.9 44.8 50.0 0.1 1.8 0.8 29792.0
Estonia 59.1 34.4 56.7 0.1 36.4 0.8 37201.1
Cyprus 53.5 22.0 55.1 0.2 2.0 0.7 40922.7
Slovenia 52.8 39.8 68.4 0.2 6.8 0.7 38656.2
Italy 49.9 19.4 62.1 0.2 12.2 0.8 43583.4
Poland 47.2 18.1 56.8 0.1 4.6 0.5 32360.9
Latvia 46.5 16.8 57.9 0.1 20.1 0.7 29832.1
Croatia 44.8 45.8 53.1 0.1 7.3 0.6 29043.9
Czech Republic 43.8 25.5 53.6 0.2 48.2 0.7 42016.4
Lithuania 41.7 19.9 56.8 0.1 7.0 0.8 36491.9
Slovakia 28.2 29.0 52.4 0.1 8.5 0.7 31514.2
Romania 23.6 20.7 52.4 0.1 1.6 0.6 29569.6
Bulgaria 16.7 25.8 58.0 0.0 4.3 0.5 24052.2
Hungary 16.2 42.8 50.8 0.1 17.4 0.4 32258.3
### check some bivariate relationships; choose three key demographic variblaes
# (1) age
ggplot(df_reduced, aes(x = age, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Age and Transgender Rights Support",
       x = "Age", y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# looks like age has a non-linear effect as the curve is flat for ages until ~60 and then falls off
# this suggests we should use a quadratic term for age

# (2) political ideology
ggplot(df_reduced, aes(x = political_ideology, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Political Ideology and Transgender Rights Support",
       x = "Political Ideology (Left to Right)", 
       y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# (3a) religion
ggplot(df_reduced_aggregated_complete, aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region), alpha = 0.8) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 15) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       subtitle = "Country-level data from Special Eurobarometer 493 (2019)",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Document Changes",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal() 
## `geom_smooth()` using formula = 'y ~ x'

# (3b) religion disaggregated
religion_support <- df_reduced %>%
  mutate(religion_group = case_when(
    religion == 1 ~ "Catholic",
    religion == 2 ~ "Orthodox",
    religion == 3 ~ "Protestant",
    religion == 4 ~ "Other Christian",
    religion %in% c(6:8) ~ "Muslim",
    religion %in% c(9:11) ~ "Other Religion",
    religion == 13 ~ "Atheist",
    religion == 14 ~ "Non-believer",
    TRUE ~ "Other/Missing")) %>%
  group_by(religion_group) %>%
  summarize(
    support_rate = mean(qc19 == 1, na.rm = TRUE) * 100,
    sample_size = n(),
    se = sqrt((support_rate/100 * (1-support_rate/100)) / sample_size) * 100) %>%
  filter(religion_group != "Other/Missing", sample_size >= 30) %>%
  arrange(desc(support_rate))

ggplot(religion_support, aes(x = reorder(religion_group, support_rate), y = support_rate)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = support_rate - 1.96*se, 
                    ymax = support_rate + 1.96*se),
                width = 0.2) +
  geom_text(aes(label = sprintf("%.1f%%", support_rate), y = support_rate + 3),
            vjust = 0) +
  geom_text(aes(label = paste0("n=", sample_size), y = -5),
            vjust = 1, size = 3) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by Religious Affiliation",
       subtitle = "Percentage supporting transgender document changes by religion",
       x = NULL,
       y = "Support Rate (%)") +
  ylim(-5, max(religion_support$support_rate) + 10) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(face = "bold"))

### intermediate conclusion:
# based on the results, we should use the following variables in the multilevel
# regression model

# country-level variables: 
# z_rainbow_score, z_gender_equality, regions, z_v2x_libdem, z_v2z_egaldem, 
# norm_gdp_2018


### use ML for predictor selection (country variables)
# prepare data for random forest by selecting variables we considered before
df_reduced$region <- as.factor(df_reduced$region)
df_reduced$Regime_type <- as.factor(df_reduced$Regime_type)
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, ifelse(df_reduced$qc19 == 2, 0, NA)) %>%
  as.factor() # convert target to binary

# remove NAs due to the coding before
df_reduced <- df_reduced %>%
  na.omit()

# join
df_reduced_rf <- df_reduced %>%
  left_join(country_data_combined, by = "country_name") 

model_data <- df_reduced_rf %>%
  select(region.x, z_v2x_libdem, z_v2x_egaldem, z_v2x_freexp, z_v2x_gender, z_v2x_liberal,
         z_gender_equality_index, z_rainbow_score_2019, norm_gdp_2018, norm_Unemployment,
         norm_Happiness_Score, norm_gdp_growth, z_Functioning_of_government,
         z_Electoral_process_and_pluralism, qc19)

# split data into training and testing
set.seed(69420)
train_index <- createDataPartition(model_data$qc19, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# train random forest
rf_model <- ranger(
  qc19 ~ ., # Exclude country as a predictor
  data = train_data,
  importance = "impurity", # Gini importance
  num.trees = 500,
  mtry = floor(sqrt(ncol(train_data) - 2)), # Default mtry for classification
  probability = TRUE)

# extract variable importance
var_importance <- importance(rf_model)
var_importance_df <- data.frame(
  Variable = names(var_importance),
  Importance = var_importance)
var_importance_df <- var_importance_df[order(var_importance_df$Importance, decreasing = TRUE), ]

# Plot variable importance
ggplot(var_importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Variable Importance from Random Forest",
       x = "Variables",
       y = "Importance")

Explanatory Modeling

Load libraries

library(tidyverse)
library(lme4)      # For multilevel modeling
library(lmerTest)  # For p-values in multilevel models
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(ggplot2)   # For data visualization
library(dplyr)     # For data manipulation
library(sjstats)   # For calculating ICC
## 
## Attaching package: 'sjstats'
## The following object is masked from 'package:survey':
## 
##     cv
library(sjPlot)    # For plotting model results
## #refugeeswelcome
library(modelsummary)
library(broom.mixed)  # For extracting mixed model statistics
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(rtf)
## 
## Attaching package: 'rtf'
## The following object is masked from 'package:purrr':
## 
##     done
## The following object is masked from 'package:tibble':
## 
##     view
library(haven)
library(MuMIn)
## 
## Attaching package: 'MuMIn'
## The following object is masked from 'package:ranger':
## 
##     importance
## The following object is masked from 'package:randomForest':
## 
##     importance
data <- read_dta("data/raw/ZA7575.dta")
df_reduced <- readRDS("df_reduced.rds")

# restore the old NAs and do CCA
df_reduced$qc19 <- data$qc19
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 3, NA, df_reduced$qc19)

df_reduced <- df_reduced %>%
  drop_na(qc19)
  
# quick data check
table(df_reduced$country_name)
## 
##        Austria        Belgium       Bulgaria        Croatia         Cyprus 
##            957           1014            738            912            399 
## Czech Republic        Denmark        Estonia        Finland         France 
##            919            930            803            883            915 
##        Germany         Greece        Hungary        Ireland          Italy 
##           1333            923            903            882            889 
##         Latvia      Lithuania     Luxembourg          Malta    Netherlands 
##            847            851            468            448            980 
##         Poland       Portugal        Romania       Slovakia       Slovenia 
##            818            911            905            886            920 
##          Spain         Sweden United Kingdom 
##            921            902            901

Visualize qc19 by country

# Calculate the percentage of "Yes" (1) responses for qc19 by country
country_means <- aggregate(qc19 == 1 ~ country_name, data = df_reduced, FUN = mean)

# Multiply by 100 to get percentage and then sort
country_means$percentage <- country_means$`qc19 == 1` * 100
country_means <- country_means[order(country_means$percentage), ]

# Create the plot
ggplot(country_means, aes(x = reorder(country_name, percentage), y = percentage)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = "Support for Transgender Rights by Country",
       subtitle = "Percentage answering 'Yes' to allowing transgender people to change civil documents",
       x = "Country",
       y = "Support (%)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Prepare for GLM

# scale some of the variables (both categorical and continuous) with larger scales
df_reduced <- df_reduced %>%
  mutate(
    age_z = scale(age),
    religion_z = scale(as.numeric(religion)),
    political_ideology_z = scale(political_ideology),
    respondent_cooperation_z = scale(as.numeric(respondent_cooperation)),
    rainbow_score_z = scale(rainbow_score_2019),
    v2x_egaldem_z = scale(v2x_egaldem))

# Recode qc19 from 1 and 2 to 0 and 1 for the binary model
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, 0)

# Optimize the settings to avoid convergence problems in the glmer() functions
control = glmerControl(optimizer = "bobyqa", 
                       optCtrl = list(maxfun = 100000))

# Recode some of the variables such that higher values translate to a more positive
# attitude towards LGBT rights in general -> simplieies coefficient interpretation
df_reduced <- df_reduced %>% 
  mutate(trans_friends = ifelse(trans_friends == 1, 2, 1),  # trans_friends is sd1_7 (see Reduced dataset.R)
         lgb_friends = ifelse(lgb_friends == 1, 2, 1)) # lgb_friends is sd1_4 (see Reduced dataset.R)

1. Null model (intercept only)

# STEP 1: Null Model (Intercept-only model)
null_model <- glmer(qc19 ~ 1 + (1|country_name), family = "binomial", data = df_reduced)
summary(null_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: qc19 ~ 1 + (1 | country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  28212.6  28228.8 -14104.3  28208.6    24156 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0233 -0.9008  0.5066  0.6531  2.2755 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  country_name (Intercept) 0.9649   0.9823  
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.4495     0.1856   2.422   0.0154 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate ICC
performance::icc(null_model)
## # Intraclass Correlation Coefficient
## 
##     Adjusted ICC: 0.227
##   Unadjusted ICC: 0.227

The null model indicates a statistically significant baseline probability of support for transgender individuals obtaining official documents (intercept = 0.45, p = 0.015), with substantial country-level variation accounting for 22.7% of the total variance in this support (ICC = 0.227).

2. Individual-level predictors (level 1)

# STEP 2: Add Individual-Level Predictors (Level 1)
# starting with predictors that showed correlation with qc19
model1 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + (1 | country_name)
##    Data: df_reduced
## Control: control
## 
##      AIC      BIC   logLik deviance df.resid 
##  26411.0  26483.8 -13196.5  26393.0    24149 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1228 -0.7612  0.3617  0.6576  4.7443 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  country_name (Intercept) 0.6956   0.834   
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.79076    0.17981  -9.959  < 2e-16 ***
## age_z                    -0.08163    0.01613  -5.060 4.20e-07 ***
## gender                    0.38278    0.03068  12.477  < 2e-16 ***
## religion_z                0.10468    0.01767   5.924 3.14e-09 ***
## political_ideology_z     -0.13256    0.01552  -8.543  < 2e-16 ***
## lgb_friends               0.88187    0.03648  24.173  < 2e-16 ***
## trans_friends             0.40647    0.06001   6.773 1.26e-11 ***
## respondent_cooperation_z -0.27127    0.01610 -16.845  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f
## age_z       -0.070                                          
## gender      -0.246 -0.022                                   
## religion_z   0.014  0.119  0.098                            
## pltcl_dlgy_ -0.021  0.045  0.007  0.070                     
## lgb_friends -0.187  0.211 -0.043 -0.087  0.032              
## trans_frnds -0.297  0.036 -0.005 -0.025  0.015 -0.224       
## rspndnt_cp_ -0.021 -0.078 -0.023 -0.004 -0.005  0.068  0.017
# compare with the null_model
anova(null_model, model1)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
##            npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## null_model    2 28213 28229 -14104    28209                         
## model1        9 26411 26484 -13196    26393 1815.7  7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791***
(0.186) (0.180)
age_z -0.082***
(0.016)
gender 0.383***
(0.031)
religion_z 0.105***
(0.018)
political_ideology_z -0.133***
(0.016)
lgb_friends 0.882***
(0.036)
trans_friends 0.406***
(0.060)
respondent_cooperation_z -0.271***
(0.016)
SD (Intercept country_name) 0.982 0.834
Num.Obs. 24158 24158
AIC 28212.6 26411.0
BIC 28228.8 26483.8

Model 1 significantly improves fit over the null model (χ² = 1720.5, p < 0.001, lower AIC/BIC). Transgender documentation is positively associated with characteristics such as being female and having LGB/transgender friends, while negatively associated with age and respondent cooperation during the survey.

3. Country-level predictors (level 2)

# STEP 3: Add Country-Level Predictors (Level 2)

# We test three different models based on 'themes'. As we only have 28 level-2 units, 
# we should not add more than 2-3 country-level variables to each model --> overfitting

# Political/ Economic development model
model_econ_pol <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + 
                   trans_friends + respondent_cooperation_z +
                   scale(gdp_2018) + z_functioning_of_government + 
                   (1 + lgb_friends|country_name), 
                   family = "binomial", data = df_reduced)

# Social wellbeing model
model_wellbeing <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + 
                        trans_friends + respondent_cooperation_z +
                        scale(Happiness_Score) + scale(gender_equality_index) + 
                        (1 + lgb_friends|country_name), 
                        family = "binomial", data = df_reduced)

# LGBTQ/equality values model 
model_equality <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends +       respondent_cooperation_z +
                 rainbow_score_z + v2x_egaldem_z + (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)

# decide which model is the best
anova(model_econ_pol, model_wellbeing, model_equality) # equality is the best
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_econ_pol: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(gdp_2018) + z_functioning_of_government + (1 + lgb_friends | country_name)
## model_wellbeing: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(Happiness_Score) + scale(gender_equality_index) + (1 + lgb_friends | country_name)
##                 npar   AIC   BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality    11 26382 26471 -13180    26360                    
## model_econ_pol    13 26401 26506 -13188    26375     0  2          1
## model_wellbeing   13 26404 26509 -13189    26378     0  0
# check whether the equality model can be improved
model_equality_2 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends +       respondent_cooperation_z +
                 rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + 
                   (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)

anova(model_equality, model_equality_2) # no improvement --> keep model_equality
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_equality_2: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + (1 | country_name)
##                  npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)
## model_equality     11 26382 26471 -13180    26360                     
## model_equality_2   12 26383 26480 -13180    26359 1.0893  1     0.2966
# test for multicollinearity among the two cultural country-level variables
vif_model <- lm(qc19 ~ rainbow_score_z + v2x_egaldem_z, data = df_reduced)
vif(vif_model)  # no multicollinearity issues --> keep model_equality as it is
## rainbow_score_z   v2x_egaldem_z 
##        1.179562        1.179562
# compare with the previous two models
anova(null_model, model1, model_equality)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
##                npar   AIC   BIC logLik deviance    Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                           
## model1            9 26411 26484 -13196    26393 1815.669  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.612  2  8.285e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763***
(0.186) (0.180) (0.122)
age_z -0.082*** -0.083***
(0.016) (0.016)
gender 0.383*** 0.384***
(0.031) (0.031)
religion_z 0.105*** 0.102***
(0.018) (0.018)
political_ideology_z -0.133*** -0.132***
(0.016) (0.016)
lgb_friends 0.882*** 0.876***
(0.036) (0.036)
trans_friends 0.406*** 0.407***
(0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269***
(0.016) (0.016)
rainbow_score_z 0.354***
(0.091)
v2x_egaldem_z 0.480***
(0.095)
SD (Intercept country_name) 0.982 0.834 0.458
Num.Obs. 24158 24158 24158
AIC 28212.6 26411.0 26382.4
BIC 28228.8 26483.8 26471.4

Model 2 improves fit yet again by adding country-level predictors (χ² = 33.7, p < 0.001, AIC reduced by 29.7 points). The strongest predictors are LGB friends (0.88), egalitarian democracy (0.49), transgender friends (0.42), and LGBTQ legal protections (rainbow score, 0.37), while individual-level predictors remained consistent with Model 1.

4. Random slopes for individual-level predictors

# STEP 4: Add Random Slopes for Individual-Level Predictors
# Let's add random slopes for lgb_friends which has shown a strong effect
model3 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z + v2x_egaldem_z + 
                (1 + lgb_friends|country_name), 
               family = "binomial", 
               data = df_reduced)
summary(model3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + (1 + lgb_friends | country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26379.2  26484.4 -13176.6  26353.2    24145 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8970 -0.7605  0.3654  0.6560  4.7324 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28167  0.5307        
##               lgb_friends 0.02986  0.1728   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.75034    0.13324 -13.136  < 2e-16 ***
## age_z                    -0.08369    0.01617  -5.177 2.25e-07 ***
## gender                    0.38596    0.03072  12.565  < 2e-16 ***
## religion_z                0.10190    0.01769   5.761 8.36e-09 ***
## political_ideology_z     -0.13125    0.01556  -8.437  < 2e-16 ***
## lgb_friends               0.87288    0.05021  17.385  < 2e-16 ***
## trans_friends             0.40446    0.06011   6.728 1.72e-11 ***
## respondent_cooperation_z -0.26967    0.01612 -16.734  < 2e-16 ***
## rainbow_score_z           0.36167    0.09015   4.012 6.02e-05 ***
## v2x_egaldem_z             0.47996    0.09428   5.091 3.57e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.332 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.046  0.007  0.070                                   
## lgb_friends -0.466  0.158 -0.029 -0.059  0.027                            
## trans_frnds -0.400  0.036 -0.005 -0.025  0.015 -0.166                     
## rspndnt_cp_ -0.028 -0.078 -0.024 -0.003 -0.004  0.048  0.018              
## ranbw_scr_z  0.030 -0.032  0.013 -0.003  0.014 -0.029 -0.008  0.001       
## v2x_egldm_z  0.006 -0.012  0.012 -0.020  0.011 -0.008  0.012  0.011 -0.346
# compare with previous three models
anova(null_model, model1, model_equality, model3)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality,
       "Model 4" = model3),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3 Model 4
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763*** -1.750***
(0.186) (0.180) (0.122) (0.133)
age_z -0.082*** -0.083*** -0.084***
(0.016) (0.016) (0.016)
gender 0.383*** 0.384*** 0.386***
(0.031) (0.031) (0.031)
religion_z 0.105*** 0.102*** 0.102***
(0.018) (0.018) (0.018)
political_ideology_z -0.133*** -0.132*** -0.131***
(0.016) (0.016) (0.016)
lgb_friends 0.882*** 0.876*** 0.873***
(0.036) (0.036) (0.050)
trans_friends 0.406*** 0.407*** 0.404***
(0.060) (0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269*** -0.270***
(0.016) (0.016) (0.016)
rainbow_score_z 0.354*** 0.362***
(0.091) (0.090)
v2x_egaldem_z 0.480*** 0.480***
(0.095) (0.094)
SD (Intercept country_name) 0.982 0.834 0.458 0.531
SD (lgb_friends country_name) 0.173
Cor (Intercept~lgb_friends country_name) -0.532
Num.Obs. 24158 24158 24158 24158
AIC 28212.6 26411.0 26382.4 26379.2
BIC 28228.8 26483.8 26471.4 26484.4

Model 3 slightly improves fit by adding random slopes for LGB friends (χ² = 7.92, p = 0.019). There is modest cross-country variation in the effect of having LGB friends (SD = 0.176), with a negative correlation (-0.48) between intercepts and slopes indicating stronger effects in countries with lower baseline support.

5. Cross-level interactions

# STEP 5: Add Cross-Level Interactions
# Assuming the random slope for lgb_friends is significant
# We'll test cross-level interactions with both country-level predictors
model4a <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z + v2x_egaldem_z + 
                lgb_friends:rainbow_score_z +
                (1 + lgb_friends|country_name), 
               family = "binomial", data = df_reduced)
summary(model4a)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends |  
##     country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26381.2  26494.5 -13176.6  26353.2    24144 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8950 -0.7604  0.3654  0.6560  4.7324 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28176  0.5308        
##               lgb_friends 0.02985  0.1728   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.7502822  0.1332809 -13.132  < 2e-16 ***
## age_z                       -0.0836921  0.0161660  -5.177 2.25e-07 ***
## gender                       0.3859664  0.0307213  12.563  < 2e-16 ***
## religion_z                   0.1019082  0.0176897   5.761 8.37e-09 ***
## political_ideology_z        -0.1312573  0.0155631  -8.434  < 2e-16 ***
## lgb_friends                  0.8728531  0.0502206  17.380  < 2e-16 ***
## trans_friends                0.4044711  0.0601146   6.728 1.72e-11 ***
## respondent_cooperation_z    -0.2696768  0.0161184 -16.731  < 2e-16 ***
## rainbow_score_z              0.3625479  0.1144465   3.168  0.00154 ** 
## v2x_egaldem_z                0.4799307  0.0943035   5.089 3.60e-07 ***
## lgb_friends:rainbow_score_z -0.0005866  0.0481628  -0.012  0.99028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.331 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.046  0.007  0.070                                   
## lgb_friends -0.466  0.159 -0.030 -0.059  0.028                            
## trans_frnds -0.400  0.036 -0.005 -0.025  0.015 -0.167                     
## rspndnt_cp_ -0.029 -0.078 -0.024 -0.003 -0.003  0.049  0.017              
## ranbw_scr_z  0.038 -0.026  0.020  0.006 -0.007 -0.038 -0.002 -0.012       
## v2x_egldm_z  0.006 -0.012  0.012 -0.020  0.012 -0.007  0.012  0.011 -0.286
## lgb_frnd:__ -0.024  0.002 -0.016 -0.013  0.029  0.025 -0.008  0.020 -0.616
##             v2x_g_
## age_z             
## gender            
## religion_z        
## pltcl_dlgy_       
## lgb_friends       
## trans_frnds       
## rspndnt_cp_       
## ranbw_scr_z       
## v2x_egldm_z       
## lgb_frnd:__  0.022
# Alternative interaction with v2x_egaldem
model4b <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z+ v2x_egaldem_z + 
                lgb_friends:v2x_egaldem_z +
                (1 + lgb_friends|country_name), 
                family = "binomial",
               data = df_reduced)
summary(model4b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends |  
##     country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26380.7  26494.0 -13176.3  26352.7    24144 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.9289 -0.7612  0.3665  0.6548  4.7850 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28103  0.5301        
##               lgb_friends 0.02859  0.1691   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.74930    0.13319 -13.134  < 2e-16 ***
## age_z                     -0.08385    0.01617  -5.187 2.14e-07 ***
## gender                     0.38602    0.03072  12.566  < 2e-16 ***
## religion_z                 0.10193    0.01769   5.763 8.24e-09 ***
## political_ideology_z      -0.13156    0.01556  -8.453  < 2e-16 ***
## lgb_friends                0.87464    0.04979  17.568  < 2e-16 ***
## trans_friends              0.40301    0.06009   6.706 1.99e-11 ***
## respondent_cooperation_z  -0.26977    0.01612 -16.738  < 2e-16 ***
## rainbow_score_z            0.36184    0.09011   4.016 5.93e-05 ***
## v2x_egaldem_z              0.53462    0.11836   4.517 6.27e-06 ***
## lgb_friends:v2x_egaldem_z -0.03826    0.05046  -0.758    0.448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.332 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.047  0.007  0.070                                   
## lgb_friends -0.464  0.159 -0.030 -0.059  0.026                            
## trans_frnds -0.400  0.037 -0.005 -0.025  0.016 -0.169                     
## rspndnt_cp_ -0.028 -0.077 -0.024 -0.003 -0.004  0.048  0.018              
## ranbw_scr_z  0.030 -0.032  0.012 -0.003  0.014 -0.029 -0.008  0.001       
## v2x_egldm_z  0.019 -0.022  0.012 -0.016 -0.010  0.008 -0.010  0.004 -0.269
## lgb_frn:2__ -0.012  0.013 -0.004 -0.002  0.026 -0.046  0.033  0.009 -0.005
##             v2x_g_
## age_z             
## gender            
## religion_z        
## pltcl_dlgy_       
## lgb_friends       
## trans_frnds       
## rspndnt_cp_       
## ranbw_scr_z       
## v2x_egldm_z       
## lgb_frn:2__ -0.616
# compare with previous models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## model4a          14 26381 26494 -13177    26353    0.0001  1    0.99024    
## model4b          14 26381 26494 -13176    26353    0.5685  0               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality,
       "Model 4" = model3,
       "Model 5a" = model4a,
       "Model 5b" = model4b),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3 Model 4 Model 5a Model 5b
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763*** -1.750*** -1.750*** -1.749***
(0.186) (0.180) (0.122) (0.133) (0.133) (0.133)
age_z -0.082*** -0.083*** -0.084*** -0.084*** -0.084***
(0.016) (0.016) (0.016) (0.016) (0.016)
gender 0.383*** 0.384*** 0.386*** 0.386*** 0.386***
(0.031) (0.031) (0.031) (0.031) (0.031)
religion_z 0.105*** 0.102*** 0.102*** 0.102*** 0.102***
(0.018) (0.018) (0.018) (0.018) (0.018)
political_ideology_z -0.133*** -0.132*** -0.131*** -0.131*** -0.132***
(0.016) (0.016) (0.016) (0.016) (0.016)
lgb_friends 0.882*** 0.876*** 0.873*** 0.873*** 0.875***
(0.036) (0.036) (0.050) (0.050) (0.050)
trans_friends 0.406*** 0.407*** 0.404*** 0.404*** 0.403***
(0.060) (0.060) (0.060) (0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269*** -0.270*** -0.270*** -0.270***
(0.016) (0.016) (0.016) (0.016) (0.016)
rainbow_score_z 0.354*** 0.362*** 0.363** 0.362***
(0.091) (0.090) (0.114) (0.090)
v2x_egaldem_z 0.480*** 0.480*** 0.480*** 0.535***
(0.095) (0.094) (0.094) (0.118)
lgb_friends × rainbow_score_z -0.001
(0.048)
lgb_friends × v2x_egaldem_z -0.038
(0.050)
SD (Intercept country_name) 0.982 0.834 0.458 0.531 0.531 0.530
SD (lgb_friends country_name) 0.173 0.173 0.169
Cor (Intercept~lgb_friends country_name) -0.532 -0.532 -0.532
Num.Obs. 24158 24158 24158 24158 24158 24158
AIC 28212.6 26411.0 26382.4 26379.2 26381.2 26380.7
BIC 28228.8 26483.8 26471.4 26484.4 26494.5 26494.0

Testing cross-level interactions shows that neither the interaction between LGB friends and Rainbow score (p = 0.90) nor between LGB friends and egalitarian democracy (p = 0.41) is significant. Adding these interaction terms actually worsens model fit (AIC increases).

6. Compare models

# compare all models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## model4a          14 26381 26494 -13177    26353    0.0001  1    0.99024    
## model4b          14 26381 26494 -13176    26353    0.5685  0               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use Stargazer to display the output in a nice table
model_stats_extended <- function(models, model_names) {
  result <- data.frame(
    Model = character(),
    AIC = numeric(),
    BIC = numeric(),
    LogLik = numeric(),
    Deviance = numeric(),
    ChiSq = numeric(),
    Df = numeric(),
    p = numeric(),
    ICC = numeric(),
    stringsAsFactors = FALSE
  )
  
  # Extract basic statistics
  for (i in 1:length(models)) {
    model <- models[[i]]
    result[i, "Model"] <- model_names[i]
    result[i, "AIC"] <- AIC(model)
    result[i, "BIC"] <- BIC(model)
    result[i, "LogLik"] <- as.numeric(logLik(model))
    result[i, "Deviance"] <- deviance(model)
    result[i, "ICC"] <- performance::icc(model)$ICC_conditional
  }
  
  # Extract chi-square statistics from ANOVA comparison
  anova_result <- anova(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], models[[6]])
  
  for (i in 2:length(models)) {
    result[i, "ChiSq"] <- anova_result$Chisq[i]
    result[i, "Df"] <- anova_result$Df[i]
    result[i, "p"] <- anova_result$`Pr(>Chisq)`[i]
  }
  
  return(result)
}

# for the labels
model_names <- c("Null model", "Individual predictors", "Country-level predictors", 
                 "Random slopes", "Interaction (Rainbow)", "Interaction (Egalitarian)")

# extract statistics
all_models_stats <- model_stats_extended(
  list(null_model, model1, model_equality, model3, model4a, model4b),
  model_names)

# split into two separate tables because one would be too wide
main_models_table <- stargazer(null_model, model1, model_equality, model3, 
          type = "text",
          title = "Main Multilevel Models for Transgender Document Change Support",
          column.labels = model_names[1:4],
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          intercept.bottom = FALSE,
          single.row = FALSE,
          model.numbers = FALSE,
          notes.append = FALSE,
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          add.lines = list(
            c("AIC", round(all_models_stats$AIC[1:4], 1)),
            c("BIC", round(all_models_stats$BIC[1:4], 1)),
            c("Log Likelihood", round(all_models_stats$LogLik[1:4], 1)),
            c("Chi-square", c("", round(all_models_stats$ChiSq[2:4], 2))),
            c("df", c("", all_models_stats$Df[2:4])),
            c("p-value", c("", format.pval(all_models_stats$p[2:4], digits = 3))),
            c("ICC", round(all_models_stats$ICC[1:4], 3))))
## 
## Main Multilevel Models for Transgender Document Change Support
## =================================================================================================
##                                                    Dependent variable:                           
##                          ------------------------------------------------------------------------
##                                                            qc19                                  
##                          Null model  Individual predictors Country-level predictors Random slopes
## -------------------------------------------------------------------------------------------------
## Constant                   0.449*          -1.791***              -1.763***           -1.750***  
##                            (0.186)          (0.180)                (0.122)             (0.133)   
##                                                                                                  
## age_z                                      -0.082***              -0.083***           -0.084***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## gender                                     0.383***                0.384***           0.386***   
##                                             (0.031)                (0.031)             (0.031)   
##                                                                                                  
## religion_z                                 0.105***                0.102***           0.102***   
##                                             (0.018)                (0.018)             (0.018)   
##                                                                                                  
## political_ideology_z                       -0.133***              -0.132***           -0.131***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## lgb_friends                                0.882***                0.876***           0.873***   
##                                             (0.036)                (0.036)             (0.050)   
##                                                                                                  
## trans_friends                              0.406***                0.407***           0.404***   
##                                             (0.060)                (0.060)             (0.060)   
##                                                                                                  
## respondent_cooperation_z                   -0.271***              -0.269***           -0.270***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## rainbow_score_z                                                    0.354***           0.362***   
##                                                                    (0.091)             (0.090)   
##                                                                                                  
## v2x_egaldem_z                                                      0.480***           0.480***   
##                                                                    (0.095)             (0.094)   
##                                                                                                  
## -------------------------------------------------------------------------------------------------
## AIC                        28212.6           26411                 26382.4             26379.2   
## BIC                        28228.8          26483.8                26471.4             26484.4   
## Log Likelihood            -14104.3         -13196.5                -13180.2           -13176.6   
## Chi-square                                  1815.67                 32.61               7.12     
## df                                             7                      2                   2      
## p-value                                     < 2e-16                8.28e-08            0.0284    
## ICC                         0.227            0.154                  0.043               0.043    
## Observations               24,158           24,158                  24,158             24,158    
## Log Likelihood           -14,104.320      -13,196.490            -13,180.180         -13,176.620 
## Akaike Inf. Crit.        28,212.650       26,410.980              26,382.360         26,379.240  
## Bayesian Inf. Crit.      28,228.830       26,483.810              26,471.380         26,484.440  
## =================================================================================================
## Note:                                                            * p<0.05; ** p<0.01; *** p<0.001
interaction_models_table <- stargazer(model3, model4a, model4b, 
          type = "text",
          title = "Interaction Models for Transgender Document Change Support",
          column.labels = model_names[4:6],
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          intercept.bottom = FALSE,
          single.row = FALSE,
          model.numbers = FALSE,
          notes.append = FALSE,
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          add.lines = list(
            c("AIC", round(all_models_stats$AIC[4:6], 1)),
            c("BIC", round(all_models_stats$BIC[4:6], 1)),
            c("Log Likelihood", round(all_models_stats$LogLik[4:6], 1)),
            c("Chi-square", c("", round(all_models_stats$ChiSq[5:6], 2))),
            c("df", c("", all_models_stats$Df[5:6])),
            c("p-value", c("", format.pval(all_models_stats$p[5:6], digits = 3))),
            c("ICC", round(all_models_stats$ICC[4:6], 3))))
## 
## Interaction Models for Transgender Document Change Support
## =========================================================================================
##                                                  Dependent variable:                     
##                             -------------------------------------------------------------
##                                                         qc19                             
##                             Random slopes Interaction (Rainbow) Interaction (Egalitarian)
## -----------------------------------------------------------------------------------------
## Constant                      -1.750***         -1.750***               -1.749***        
##                                (0.133)           (0.133)                 (0.133)         
##                                                                                          
## age_z                         -0.084***         -0.084***               -0.084***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## gender                        0.386***          0.386***                0.386***         
##                                (0.031)           (0.031)                 (0.031)         
##                                                                                          
## religion_z                    0.102***          0.102***                0.102***         
##                                (0.018)           (0.018)                 (0.018)         
##                                                                                          
## political_ideology_z          -0.131***         -0.131***               -0.132***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## lgb_friends                   0.873***          0.873***                0.875***         
##                                (0.050)           (0.050)                 (0.050)         
##                                                                                          
## trans_friends                 0.404***          0.404***                0.403***         
##                                (0.060)           (0.060)                 (0.060)         
##                                                                                          
## respondent_cooperation_z      -0.270***         -0.270***               -0.270***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## rainbow_score_z               0.362***           0.363**                0.362***         
##                                (0.090)           (0.114)                 (0.090)         
##                                                                                          
## v2x_egaldem_z                 0.480***          0.480***                0.535***         
##                                (0.094)           (0.094)                 (0.118)         
##                                                                                          
## lgb_friends:rainbow_score_z                      -0.001                                  
##                                                  (0.048)                                 
##                                                                                          
## lgb_friends:v2x_egaldem_z                                                -0.038          
##                                                                          (0.050)         
##                                                                                          
## -----------------------------------------------------------------------------------------
## AIC                            26379.2           26381.2                 26380.7         
## BIC                            26484.4           26494.5                  26494          
## Log Likelihood                -13176.6          -13176.6                -13176.3         
## Chi-square                                          0                     0.57           
## df                                                  1                       0            
## p-value                                           0.99                     NA            
## ICC                             0.043             0.043                   0.043          
## Observations                   24,158            24,158                  24,158          
## Log Likelihood               -13,176.620       -13,176.620             -13,176.330       
## Akaike Inf. Crit.            26,379.240        26,381.240              26,380.670        
## Bayesian Inf. Crit.          26,484.440        26,494.530              26,493.960        
## =========================================================================================
## Note:                                                    * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for interaction models
anova_results_interactions <- anova(model3, model4a, model4b)

stargazer(model3, model4a, model4b, 
          type = "html",
          title = "Interaction Models for Transgender Document Change Support",
          column.labels = c("Random Slopes", "Rainbow Interaction", "Egalitarian Interaction"),
          covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)", 
                              "Political ideology (z-score)", "Has LGB friends", 
                              "Has transgender friends", "Respondent cooperation (z-score)",
                              "Rainbow score (z-score)", "Egalitarian democracy (z-score)",
                              "LGB friends × Rainbow score", 
                              "LGB friends × Egalitarian democracy"),
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          add.lines = list(
            c("Chi-square", "", 
              round(anova_results_interactions$Chisq[2], 2), 
              round(anova_results_interactions$Chisq[3], 2)),
            c("df", "", 
              anova_results_interactions$Df[2], 
              anova_results_interactions$Df[3]),
            c("p-value", "", 
              format.pval(anova_results_interactions$`Pr(>Chisq)`[2], digits = 3),
              format.pval(anova_results_interactions$`Pr(>Chisq)`[3], digits = 3)),
            c("ICC", 
              round(performance::icc(model3)$ICC_conditional, 3),
              round(performance::icc(model4a)$ICC_conditional, 3),
              round(performance::icc(model4b)$ICC_conditional, 3))),
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          out = "interaction_models.html")
## 
## <table style="text-align:center"><caption><strong>Interaction Models for Transgender Document Change Support</strong></caption>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="3"><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="3" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td colspan="3">qc19</td></tr>
## <tr><td style="text-align:left"></td><td>Random Slopes</td><td>Rainbow Interaction</td><td>Egalitarian Interaction</td></tr>
## <tr><td style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Age (z-score)</td><td>-0.084<sup>***</sup></td><td>-0.084<sup>***</sup></td><td>-0.084<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Gender</td><td>0.386<sup>***</sup></td><td>0.386<sup>***</sup></td><td>0.386<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.031)</td><td>(0.031)</td><td>(0.031)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Religion (z-score)</td><td>0.102<sup>***</sup></td><td>0.102<sup>***</sup></td><td>0.102<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.018)</td><td>(0.018)</td><td>(0.018)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Political ideology (z-score)</td><td>-0.131<sup>***</sup></td><td>-0.131<sup>***</sup></td><td>-0.132<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Has LGB friends</td><td>0.873<sup>***</sup></td><td>0.873<sup>***</sup></td><td>0.875<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.050)</td><td>(0.050)</td><td>(0.050)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Has transgender friends</td><td>0.404<sup>***</sup></td><td>0.404<sup>***</sup></td><td>0.403<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.060)</td><td>(0.060)</td><td>(0.060)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Respondent cooperation (z-score)</td><td>-0.270<sup>***</sup></td><td>-0.270<sup>***</sup></td><td>-0.270<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Rainbow score (z-score)</td><td>0.362<sup>***</sup></td><td>0.363<sup>**</sup></td><td>0.362<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.090)</td><td>(0.114)</td><td>(0.090)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Egalitarian democracy (z-score)</td><td>0.480<sup>***</sup></td><td>0.480<sup>***</sup></td><td>0.535<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.094)</td><td>(0.094)</td><td>(0.118)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">LGB friends × Rainbow score</td><td></td><td>-0.001</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(0.048)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">LGB friends × Egalitarian democracy</td><td></td><td></td><td>-0.038</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td>(0.050)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>-1.750<sup>***</sup></td><td>-1.750<sup>***</sup></td><td>-1.749<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.133)</td><td>(0.133)</td><td>(0.133)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Chi-square</td><td></td><td>0</td><td>0.57</td></tr>
## <tr><td style="text-align:left">df</td><td></td><td>1</td><td>0</td></tr>
## <tr><td style="text-align:left">p-value</td><td></td><td>0.99</td><td>NA</td></tr>
## <tr><td style="text-align:left">ICC</td><td>0.043</td><td>0.043</td><td>0.043</td></tr>
## <tr><td style="text-align:left">Observations</td><td>24,158</td><td>24,158</td><td>24,158</td></tr>
## <tr><td style="text-align:left">Log Likelihood</td><td>-13,176.620</td><td>-13,176.620</td><td>-13,176.330</td></tr>
## <tr><td style="text-align:left">Akaike Inf. Crit.</td><td>26,379.240</td><td>26,381.240</td><td>26,380.670</td></tr>
## <tr><td style="text-align:left">Bayesian Inf. Crit.</td><td>26,484.440</td><td>26,494.530</td><td>26,493.960</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="3" style="text-align:right"><sup>*</sup>p<0.05; <sup>**</sup>p<0.01; <sup>***</sup>p<0.001</td></tr>
## <tr><td style="text-align:left"></td><td colspan="3" style="text-align:right">* p<0.05; ** p<0.01; *** p<0.001</td></tr>
## </table>
# Get anova results for all models
anova_results_all <- anova(null_model, model1, model_equality, model3, model4a, model4b)

stargazer(null_model, model1, model_equality, model3, model4a, model4b, 
          type = "text",
          title = "Multilevel Models for Transgender Document Change Support",
          column.labels = c("Null", "Individual", "Country", "Random Slopes", 
                           "Rainbow Int.", "Egalitarian Int."),
          covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)", 
                              "Political ideology (z-score)", "Has LGB friends", 
                              "Has transgender friends", "Respondent cooperation (z-score)",
                              "Rainbow score (z-score)", "Egalitarian democracy (z-score)",
                              "LGB friends × Rainbow score", 
                              "LGB friends × Egalitarian democracy"),
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          add.lines = list(
            c("Chi-square", "", 
              round(anova_results_all$Chisq[2], 2), 
              round(anova_results_all$Chisq[3], 2),
              round(anova_results_all$Chisq[4], 2),
              round(anova_results_all$Chisq[5], 2),
              round(anova_results_all$Chisq[6], 2)),
            c("df", "", 
              anova_results_all$Df[2], 
              anova_results_all$Df[3],
              anova_results_all$Df[4],
              anova_results_all$Df[5],
              anova_results_all$Df[6]),
            c("p-value", "", 
              format.pval(anova_results_all$`Pr(>Chisq)`[2], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[3], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[4], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[5], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[6], digits = 3)),
            c("ICC", 
              round(performance::icc(null_model)$ICC_conditional, 3),
              round(performance::icc(model1)$ICC_conditional, 3),
              round(performance::icc(model_equality)$ICC_conditional, 3),
              round(performance::icc(model3)$ICC_conditional, 3),
              round(performance::icc(model4a)$ICC_conditional, 3),
              round(performance::icc(model4b)$ICC_conditional, 3))),
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          font.size = "small",  # Use smaller font to help fit the table
          out = "all_models.txt")
## 
## Multilevel Models for Transgender Document Change Support
## ===================================================================================================================
##                                                                   Dependent variable:                              
##                                     -------------------------------------------------------------------------------
##                                                                          qc19                                      
##                                        Null     Individual    Country   Random Slopes Rainbow Int. Egalitarian Int.
##                                         (1)         (2)         (3)          (4)          (5)            (6)       
## -------------------------------------------------------------------------------------------------------------------
## Age (z-score)                                    -0.082***   -0.083***    -0.084***    -0.084***      -0.084***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Gender                                           0.383***    0.384***     0.386***      0.386***       0.386***    
##                                                   (0.031)     (0.031)      (0.031)      (0.031)        (0.031)     
##                                                                                                                    
## Religion (z-score)                               0.105***    0.102***     0.102***      0.102***       0.102***    
##                                                   (0.018)     (0.018)      (0.018)      (0.018)        (0.018)     
##                                                                                                                    
## Political ideology (z-score)                     -0.133***   -0.132***    -0.131***    -0.131***      -0.132***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Has LGB friends                                  0.882***    0.876***     0.873***      0.873***       0.875***    
##                                                   (0.036)     (0.036)      (0.050)      (0.050)        (0.050)     
##                                                                                                                    
## Has transgender friends                          0.406***    0.407***     0.404***      0.404***       0.403***    
##                                                   (0.060)     (0.060)      (0.060)      (0.060)        (0.060)     
##                                                                                                                    
## Respondent cooperation (z-score)                 -0.271***   -0.269***    -0.270***    -0.270***      -0.270***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Rainbow score (z-score)                                      0.354***     0.362***      0.363**        0.362***    
##                                                               (0.091)      (0.090)      (0.114)        (0.090)     
##                                                                                                                    
## Egalitarian democracy (z-score)                              0.480***     0.480***      0.480***       0.535***    
##                                                               (0.095)      (0.094)      (0.094)        (0.118)     
##                                                                                                                    
## LGB friends × Rainbow score                                                              -0.001                    
##                                                                                         (0.048)                    
##                                                                                                                    
## LGB friends × Egalitarian democracy                                                                     -0.038     
##                                                                                                        (0.050)     
##                                                                                                                    
## Constant                              0.449*     -1.791***   -1.763***    -1.750***    -1.750***      -1.749***    
##                                       (0.186)     (0.180)     (0.122)      (0.133)      (0.133)        (0.133)     
##                                                                                                                    
## -------------------------------------------------------------------------------------------------------------------
## Chi-square                                        1815.67      32.61        7.12           0             0.57      
## df                                                   7           2            2            1              0        
## p-value                                           <2e-16     8.28e-08      0.0284         0.99            NA       
## ICC                                    0.227       0.154       0.043        0.043        0.043          0.043      
## Observations                          24,158      24,158      24,158       24,158        24,158         24,158     
## Log Likelihood                      -14,104.320 -13,196.490 -13,180.180  -13,176.620  -13,176.620    -13,176.330   
## Akaike Inf. Crit.                   28,212.650  26,410.980  26,382.360   26,379.240    26,381.240     26,380.670   
## Bayesian Inf. Crit.                 28,228.830  26,483.810  26,471.380   26,484.440    26,494.530     26,493.960   
## ===================================================================================================================
## Note:                                                                                 *p<0.05; **p<0.01; ***p<0.001
##                                                                                    * p<0.05; ** p<0.01; *** p<0.001

Model 3 (random slopes for LGB friends) is the best model, as it significantly improves fit over simpler models (χ² = 7.92, p = 0.019) and achieves the lowest AIC (26472.5). Subsequent interaction models fail to improve fit (p = 0.90, p = 0.41) and slightly increase AIC, confirming Model 3 as the most robust choice.

7. Variance explained

# For logistic regression, the level-1 variance is fixed at π²/3
pi_squared_by_3 <- (pi^2)/3  # approximately 3.29

# Get variance components from null model
var_null <- as.data.frame(VarCorr(null_model))
between_var_null <- var_null$vcov[1]  # Between-country variance
within_var_null <- pi_squared_by_3    # Fixed residual variance for logistic models

# Get variance components from final model (model3)
var_model3 <- as.data.frame(VarCorr(model3))
between_var_model3 <- var_model3$vcov[1]  # Between-country variance
within_var_model3 <- pi_squared_by_3      # Still fixed for the final model

# Calculate proportion of between-country variance explained
between_var_explained <- (between_var_null - between_var_model3) / between_var_null

# For binomial models, we can't directly calculate within-country variance explained
# We can use an approximation based on the R² measure for GLMMs
# Calculate total variance in both models
total_var_null <- between_var_null + within_var_null
total_var_model3 <- between_var_model3 + within_var_model3

# Calculate proportion of total variance explained
total_var_explained <- 1 - (total_var_model3 / total_var_null)

# Alternatively, use MuMIn package for R² calculation
library(MuMIn)
r2_model3 <- r.squaredGLMM(model3)
## Warning: the null model is only correct if all the variables it uses are identical 
## to those used in fitting the original model.
# This returns two values: R²m (marginal - fixed effects only) and R²c (conditional - fixed + random effects)

cat("Proportion of between-country variance explained:", round(between_var_explained * 100, 1), "%\n")
## Proportion of between-country variance explained: 70.8 %
cat("Proportion of total variance explained:", round(total_var_explained * 100, 1), "%\n")
## Proportion of total variance explained: 16.1 %
cat("R² marginal (fixed effects only):", round(r2_model3[1], 3), "\n")
## R² marginal (fixed effects only): 0.281
cat("R² conditional (fixed + random):", round(r2_model3[2], 3), "\n")
## R² conditional (fixed + random): 0.241

8. Vizualisations

library(ggplot2)
library(sjPlot)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
# 1. Plot fixed effects with error bars
p1 <- plot_model(model3, sort.est = TRUE, show.values = TRUE, value.offset = 0.3) +
  theme_bw() +
  labs(title = "Factors Influencing Support for Transgender Document Change",
       y = "Log-Odds (95% CI)")

# 2. Create a plot showing country variation
# Extract random effects for each country
re <- ranef(model3)$country_name
re_df <- data.frame(
  country = rownames(re),
  intercept = re$`(Intercept)`,
  lgb_slope = re$lgb_friends
)

# Sort by intercept
re_df <- re_df[order(re_df$intercept), ]
re_df$country <- factor(re_df$country, levels = re_df$country)

# Create the plot
p2 <- ggplot(re_df, aes(x = country, y = intercept)) +
  geom_point() +
  geom_errorbar(aes(ymin = intercept - 1.96*attr(re, "postVar")[1,1,]^0.5, 
                    ymax = intercept + 1.96*attr(re, "postVar")[1,1,]^0.5), 
                width = 0.2) +
  coord_flip() +
  theme_bw() +
  labs(title = "Country Differences in Support for Transgender Rights",
       subtitle = "Random intercepts with 95% confidence intervals",
       x = "",
       y = "Random Intercept")

# 3. Create effect plots for key predictors
# For Rainbow Score
p3 <- plot_model(model3, type = "pred", terms = "rainbow_score_z [-2:2]", 
                title = "Effect of Rainbow Score on Support Level", 
                axis.title = c("Rainbow Score (z-score)", "Probability of Support")) +
  theme_bw()
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.
# For Egalitarian Democracy
p4 <- plot_model(model3, type = "pred", terms = "v2x_egaldem_z [-2:2]", 
                title = "Effect of Egalitarian Democracy on Support Level",
                axis.title = c("Egalitarian Democracy (z-score)", "Probability of Support")) +
  theme_bw()

## Threshold plot
# (1) for religion
# simple approach to calculate thresholds
thresholds <- data.frame(
  country = unique(df_reduced$country_name),
  threshold = NA)

# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name

# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(thresholds)) {
  country_i <- thresholds$country[i]
  
  # Get country-specific intercept
  intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
  
  # Get religiosity coefficient
  relig_coef <- fixed_effects["religion_z"]
  
  # Calculate other effects (at reference/mean values)
  other_effects <- fixed_effects["lgb_friends"] * 2  # Has LGB friends
  if ("trans_friends" %in% names(fixed_effects))
    other_effects <- other_effects + fixed_effects["trans_friends"] * 1.5
  
  # Calculate religiosity threshold where log-odds = 0 (probability = 0.5)
  # Solve: intercept + relig_coef * threshold + other_effects = 0
  thresholds$threshold[i] <- -(intercept + other_effects) / relig_coef
}

# lot the thresholds
p5 <- ggplot(thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Religiosity Threshold for Supporting Transgender Rights by Country",
       subtitle = "Religiosity z-score at which support probability crosses 50%",
       x = "",
       y = "Religiosity Threshold (z-score)")

# (2) for political ideology
# Calculate political ideology thresholds
pol_thresholds <- data.frame(
  country = unique(df_reduced$country_name),
  threshold = NA
)

# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name

# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(pol_thresholds)) {
  country_i <- pol_thresholds$country[i]
  
  # Get country-specific intercept
  intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
  
  # Get political ideology coefficient
  pol_coef <- fixed_effects["political_ideology_z"]
  
  # Calculate other effects (at reference/mean values)
  other_effects <- fixed_effects["lgb_friends"] * 2 + # Has LGB friends
                  fixed_effects["trans_friends"] * 1.5 +
                  fixed_effects["religion_z"] * 0  # Set religion to mean
  
  # Calculate ideology threshold where log-odds = 0 (probability = 0.5)
  pol_thresholds$threshold[i] <- -(intercept + other_effects) / pol_coef
}

# Plot the thresholds
p6 <- ggplot(pol_thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
  geom_bar(stat = "identity", fill = "darkred") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Political Ideology Threshold for Supporting Transgender Rights by Country",
       subtitle = "Political ideology z-score at which support probability crosses 50%",
       x = "",
       y = "Political Ideology Threshold (z-score)")

# Display plots
p1

p2

p3

p4

p5

p6